*** Wartungsfenster jeden ersten Mittwoch vormittag im Monat ***

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • martin.weise/fairnb
1 result
Show changes
Commits on Source (6)
Showing
with 694 additions and 4228 deletions
host: https://dbrepo1.ec.tuwien.ac.at
database-id: <insert database id>
database-id: 23
credentials:
username: <insert username from dbrepo>
password: <insert password from dbrepo>
......
host: https://researchdata.tuwien.ac.at
credentials:
token: <insert token from invenio>
\ No newline at end of file
token: <insert token of InvenioRDM API>
\ No newline at end of file
......@@ -12,7 +12,7 @@ from keycloak import KeycloakOpenID
LOG = logging.getLogger(__name__)
TIMEOUT = 600
CHUNK_SIZE = 1024 * 1024 * 100
CHUNK_SIZE = 1024 * 1024 * 20
def re_auth(func: Callable) -> Callable:
@wraps(func)
......@@ -210,10 +210,10 @@ class DBRepoConnector:
return None
@re_auth
def create_table(self, dataframe: pd.DataFrame, table_name: str, table_descriptor: str):
def create_table(self, dataframe: pd.DataFrame, table_name: str, table_description: str):
""" Creates a new table """
data = self._create_table_data(dataframe, table_name, table_descriptor)
data = self._create_table_data(dataframe, table_name, table_description)
response = requests.post(
f"{self.host}/api/database/{self.database_id}/table",
......@@ -234,10 +234,10 @@ class DBRepoConnector:
def create_table_if_not_exists(self,
dataframe: pd.DataFrame,
table_name: str,
table_descriptor: str
table_description: str
):
table = table if (table := self.get_table(table_name)) is not None else \
self.create_table(dataframe, table_name, table_descriptor)
self.create_table(dataframe, table_name, table_description)
return table
......@@ -269,6 +269,7 @@ class DBRepoConnector:
"quote": '"',
"separator": ",",
"skip_lines": 1,
"line_termination": "\n",
"true_element": "True"
},
headers=self.headers
......
......@@ -8,7 +8,7 @@ import requests as rq
log = logging.getLogger(__name__)
class InvenioConnector:
class InvenioRDMConnector:
def __init__(self, token: str, host: str | None = None):
self.host = host or "https://test.researchdata.tuwien.ac.at"
self.token = token
......@@ -130,14 +130,14 @@ class InvenioConnector:
executor.map(lambda p: self.download_file(*p), args)
class InvenioManager:
class InvenioRDMManager:
"""A high level interface to up- and download files from invenio.
Utilizes state management:
1. record not assigned
2. record assigned
3. record published"""
def __init__(self, invenio_connector: InvenioConnector, record_id: str | None = None):
def __init__(self, invenio_connector: InvenioRDMConnector, record_id: str | None = None):
self.connector = invenio_connector
self.record_id = record_id
self.doi = None
......
......@@ -9,7 +9,7 @@ from fairnb.entity.entity_provenance import EntityProvenance
@dataclass
class DbRepoEntity(Entity):
class DBRepoEntity(Entity):
table_name: str = field(init=True, default=None)
table_description: str = field(init=True, default="")
table_id: int = field(init=False, default=None)
......@@ -19,7 +19,7 @@ class DbRepoEntity(Entity):
super().__post_init__()
if self.metadata is not None: # equivalent to: self.id is not None
self.table_id = int(self.metadata.uri.split("/")[-1])
self.table_id = int(self.metadata.pi.split("/")[-1])
else:
        assert self.table_name is not None # has to exist for the ability to get table_id
......@@ -60,9 +60,10 @@ class DbRepoEntity(Entity):
self.location.resolve().parent.mkdir(parents=True, exist_ok=True)
df.to_csv(self.location, index=False)
def upload(self, executed_file: Path, dependencies: list[Entity] = None,
def upload(self, executed_file: Path, main_file: Path,
dependencies: list[Entity] = None,
start_time: datetime = datetime.now(),
end_time: datetime = datetime.now()):
end_time: datetime = datetime.now()) -> EntityProvenance:
df = pd.read_csv(self.location)
# add id column to df:
......@@ -80,10 +81,11 @@ class DbRepoEntity(Entity):
self.name,
self.description,
executed_file=executed_file,
uri=f"{self.dbrepo_connector.host}/database/"
main_file=main_file,
pi=f"{self.dbrepo_connector.host}/database/"
f"{self.dbrepo_connector.database_id}/table/{self.table_id}",
type=self.type,
platform=self.repository,
repository=self.repository,
started_at=start_time,
ended_at=end_time
)
......@@ -98,6 +100,8 @@ class DbRepoEntity(Entity):
self.upload_dependencies(dependencies)
return self.metadata
def upload_data(self, df: pd.DataFrame):
assert self.id is not None
assert self.table_id is not None
......
......@@ -11,12 +11,13 @@ from fairnb.api.dbrepo import DBRepoConnector
from fairnb.entity.entity_provenance import EntityProvenance
PROVENANCE_TABLE_NAME = "entity_provenance_test3"
DEPENDENCY_TABLE_NAME = "entity_dependencies_test3"
PROVENANCE_TABLE_NAME = "entity_provenance"
DEPENDENCY_TABLE_NAME = "entity_dependencies"
LOG = logging.getLogger(__name__)
# TODO: Upload Datetime objects as Timestamps instead of str
@dataclass
class Entity(ABC):
    """ An O-Prov Entity class used to represent an Entity created by a notebook.
......@@ -65,12 +66,12 @@ class Entity(ABC):
self.download_provenance()
@abstractmethod
def download(self) -> EntityProvenance:
def download(self):
"""Download this Entity and return the attached EntityProvenance"""
raise NotImplementedError
@abstractmethod
def upload(self, executed_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
def upload(self, executed_file: Path, main_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
"""Upload this Entity"""
raise NotImplementedError
......@@ -131,6 +132,8 @@ class Entity(ABC):
self.id = meta.id
self.metadata = meta
LOG.info(f"Uploaded provenance information for {self.name} with id {self.id}: {self.metadata}")
def upload_dependencies(self, dependencies):
""" Upload the dependency information for this Entity.
It lists all entities, which have an id, this entity depends on.
......
......@@ -15,7 +15,7 @@ class EntityProvenance:
"""
id: str | None # id of entity, always unique
uri: str # unique resource identifier used to locate entity (can also be used to point to table containing entity)
pi: str # persistent identifier used to locate entity (can also be used to point to table containing entity)
name: str # name of specific entity describing the data it contains
    description: str # more detailed description of the entity
type: str # type of entity, if notebook is run with different data type stays the same
......@@ -23,9 +23,10 @@ class EntityProvenance:
branch: str # the branch of the repository, makes manual search of commit easier
repo_uri: str # the uri of the repository, used to locate the repository
executed_file: str # path to notebook which was executed to create the entity
started_at: datetime # start time of execution where entity was created
ended_at: datetime # end time of execution where entity was created
platform: str # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
main_file: str # path to the main file executing the notebook
started_at: datetime # start time of execution where entity was created
ended_at: datetime # end time of execution where entity was created
repository: str # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
@classmethod
def new(
......@@ -33,9 +34,10 @@ class EntityProvenance:
name: str,
description: str,
executed_file: Path,
main_file: Path,
type: str,
uri: str,
platform: str,
pi: str,
repository: str,
started_at: datetime,
ended_at: datetime
):
......@@ -50,20 +52,22 @@ class EntityProvenance:
repo_uri = re.sub(":\d+/", "/", f"https://{repo_uri.split('@', 1)[1]}")
executed_file_rel = executed_file.resolve().relative_to(BASE_PATH)
main_file_rel = main_file.resolve().relative_to(BASE_PATH)
return cls(
id=None,
name=name,
description=description,
uri=uri,
pi=pi,
commit=commit,
repo_uri=repo_uri,
started_at=started_at,
ended_at=ended_at,
branch=branch,
executed_file=executed_file_rel.as_posix(),
main_file=main_file_rel.as_posix(),
type=type,
platform=platform,
repository=repository,
)
@classmethod
......@@ -72,10 +76,11 @@ class EntityProvenance:
id=df["id"],
name=df["name"],
description=df["description"],
uri=df["uri"],
pi=df["pi"],
commit=df["commit"],
repo_uri=df["git_uri"],
executed_file=df["executed_file"],
main_file=df["main_file"],
started_at=datetime.strptime(
df["started_at"], "%Y-%m-%d %H:%M:%S.%f"
), # TODO: replace with '%F %T'
......@@ -84,7 +89,7 @@ class EntityProvenance:
),
branch=df["branch"],
type=df["type"],
platform=df["repository"],
repository=df["repository"],
)
def to_frame(self):
......@@ -93,14 +98,15 @@ class EntityProvenance:
"id": pd.Series(self.id, dtype=str),
"name": pd.Series(self.name, dtype=str),
"description": pd.Series(self.description, dtype=str),
"uri": pd.Series(self.uri, dtype=str),
"pi": pd.Series(self.pi, dtype=str),
"commit": pd.Series(self.commit, dtype=str),
"git_uri": pd.Series(self.repo_uri, dtype=str),
"executed_file": pd.Series(self.executed_file, dtype=str),
"main_file": pd.Series(self.main_file, dtype=str),
"started_at": pd.Series(self.started_at, dtype=str),
"ended_at": pd.Series(self.ended_at, dtype=str),
"branch": pd.Series(self.branch, dtype=str),
"type": pd.Series(self.type, dtype=str),
"repository": pd.Series(self.platform, dtype=str),
"repository": pd.Series(self.repository, dtype=str),
}
)
......@@ -3,14 +3,14 @@ from datetime import datetime
from pathlib import Path
from fairnb.api.dbrepo import DBRepoConnector
from fairnb.api.invenio import InvenioManager, InvenioConnector
from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
from fairnb.entity.entity import Entity
from fairnb.entity.entity_provenance import EntityProvenance
@dataclass
class InvenioEntity(Entity):
invenio_manager: InvenioManager = field(init=True, default=None)
class InvenioRDMEntity(Entity):
invenio_manager: InvenioRDMManager = field(init=True, default=None)
record_metadata: dict = field(init=True, default=None)
publish_record: bool = field(init=True, default=False)
platform: str = field(init=False, default="https://doi.org/10.17616/R31NJMYD")
......@@ -24,11 +24,11 @@ class InvenioEntity(Entity):
description: str,
type: str,
dbrepo_connector: DBRepoConnector,
invenio_connector: InvenioConnector,
invenio_connector: InvenioRDMConnector,
publish_record: bool = False,
):
return cls(
invenio_manager=InvenioManager(invenio_connector),
invenio_manager=InvenioRDMManager(invenio_connector),
record_metadata=record_metadata,
dbrepo_connector=dbrepo_connector,
location=location,
......@@ -44,13 +44,13 @@ class InvenioEntity(Entity):
id: str,
location: Path,
dbrepo_connector: DBRepoConnector,
invenio_connector: InvenioConnector,
invenio_connector: InvenioRDMConnector,
):
return cls(
id=id,
location=location,
dbrepo_connector=dbrepo_connector,
invenio_manager=InvenioManager(invenio_connector)
invenio_manager=InvenioRDMManager(invenio_connector)
)
def __post_init__(self):
......@@ -60,9 +60,10 @@ class InvenioEntity(Entity):
assert self.record_metadata is not None
return
self.invenio_manager.record_id = self.metadata.uri.split('/')[-1]
self.invenio_manager.record_id = self.metadata.pi.split('/')[-1]
def upload(self, executed_file: Path, dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
def upload(self, executed_file: Path, main_file: Path,
dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
dir_path: Path
regex: str
......@@ -91,9 +92,10 @@ class InvenioEntity(Entity):
name=self.name,
description=self.description,
executed_file=executed_file,
uri=uri.replace('/api', ''),
main_file=main_file,
pi=uri.replace('/api', ''),
type=self.type,
platform=self.platform,
repository=self.platform,
started_at=started_at,
ended_at=ended_at,
)
......
......@@ -73,6 +73,7 @@ class Executor:
# use inspect to get path of caller
entity.upload(
nb_config.nb_location,
nb_config.main_location,
nb_config.dependencies,
nb_config.started_at,
nb_config.ended_at
......
......@@ -8,6 +8,7 @@ from fairnb.entity.entity import Entity
@dataclass
class NbConfig:
nb_location: Path
main_location: Path
entities: list[Entity]
dependencies: list[Entity]
nb_output_location: Path = field(init=True, default=None)
......
......@@ -5,7 +5,7 @@ import pandas as pd
import tarfile
from fairnb.api.dbrepo import DBRepoConnector
from fairnb.api.invenio import InvenioManager, InvenioConnector
from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
from definitions import CONFIG_PATH
import yaml
......@@ -46,14 +46,14 @@ class Util:
def get_invenio_connector(self, path: pathlib.Path = None):
config = self.get_config(path=path)
return InvenioConnector(
return InvenioRDMConnector(
token=config["credentials"]["token"],
host=config["host"]
)
def get_invenio_manager(self, path: pathlib.Path = None):
config = self.get_config(path=path)
return InvenioManager(
return InvenioRDMManager(
self.get_invenio_connector(path=path)
)
......
%% Cell type:markdown id:4389a8092677254e tags:
# Audio Files
Bundle the provided audio files (400, in MP3) in a tar, compress it using gzip and store it in the output folder.
%% Cell type:code id:87ab37c6 tags:
``` python
from definitions import BASE_PATH
import tarfile
import zipfile
import os
from pathlib import Path
```
%% Cell type:code id:1b4e6b01 tags:parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": str(BASE_PATH / "tmp/1_audio_files/output/emotifymusic.tar.gz")
}
```
%% Cell type:code id:15dea136 tags:injected-parameters
%% Cell type:code id:1a6df3b0 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/1_audio_files/output/emotifymusic.tar.gz"
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/1_audio_files/output/emotifymusic.tar.gz"
}
```
%% Cell type:code id:1e487573 tags:
``` python
# load provided files
zip_path = BASE_PATH / "resource" / "1_audio_files" / "emotifymusic.zip"
dir_path = BASE_PATH / "tmp" / "1_audio_files" / "music"
dir_path.mkdir(parents=True, exist_ok=True)
# unzip to dir_path
with zipfile.ZipFile(zip_path, "r") as zfile:
zfile.extractall(path=dir_path)
```
%% Cell type:code id:c3193f35 tags:
``` python
file_paths = list(dir_path.rglob('**/*.*'))
flattened_dir_path = BASE_PATH / "tmp" / "1_audio_files" / "flattened"
flattened_dir_path.mkdir(parents=True, exist_ok=True)
for path in file_paths:
(flattened_dir_path / path.relative_to(dir_path).as_posix().replace('/', '_')).write_bytes(path.read_bytes())
```
%% Cell type:code id:3272ea2b tags:
``` python
tar_path = Path(OUTPUT_PATHS["audio_tar"])
tar_path.parent.mkdir(parents=True, exist_ok=True)
with tarfile.open(tar_path, "w:gz") as file:
file.add(flattened_dir_path, arcname=os.path.sep)
```
......
This diff is collapsed.
This diff is collapsed.
%% Cell type:markdown id:e92b4fe9 tags:
# Split the Features into Train and Test Set
%% Cell type:code id:5f1fae44 tags:
``` python
import pandas as pd
from pathlib import Path
from definitions import BASE_PATH
```
%% Cell type:code id:01de1b27 tags:parameters
``` python
# Tagged with 'parameters'
from definitions import BASE_PATH
INPUT_PATHS: dict[str, str] = {
"features": (BASE_PATH / "tmp" / "4_split" / "input" / "features.csv").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"split": (BASE_PATH / "tmp" / "4_split" / "output" / "split.csv").__str__()
}
```
%% Cell type:code id:e99ca0ba tags:injected-parameters
%% Cell type:code id:fdc0a0a6 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/input/features.csv"
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/input/features.csv"
}
OUTPUT_PATHS = {
"split": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/output/split.csv"
"split": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/output/split.csv"
}
```
%% Cell type:code id:a4cc6800 tags:
``` python
# INPUT
for path in INPUT_PATHS.values():
assert Path(path).exists()
features = pd.read_csv(INPUT_PATHS["aggregated_features"])
```
%% Cell type:code id:a186d0c4 tags:
``` python
train = features.sample(frac=0.8).sort_index()
train = features.sample(frac=0.8, random_state=11908553).sort_index()
test = features.drop(train.index)
split_true = pd.DataFrame({
"filename": train.filename,
"train": True
})
split_false = pd.DataFrame({
"filename": test.filename,
"train": False
})
split_concat = pd.concat([split_true, split_false])\
.sort_values("filename")\
.reset_index(drop=True)
```
%% Cell type:code id:091e0641 tags:
``` python
split_concat
```
%% Output
filename train
0 classical_1.mp3 False
1 classical_10.mp3 True
2 classical_100.mp3 False
3 classical_11.mp3 True
4 classical_12.mp3 True
.. ... ...
395 rock_95.mp3 False
396 rock_96.mp3 True
397 rock_97.mp3 True
398 rock_98.mp3 True
399 rock_99.mp3 True
[400 rows x 2 columns]
%% Cell type:code id:7b11b8bb tags:
``` python
# output
OUTPUT_PATH = Path(OUTPUT_PATHS["split"])
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
output = split_concat
output.to_csv(OUTPUT_PATH, index=False)
```
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -123,13 +123,13 @@ files = [
[[package]]
name = "anyio"
version = "4.2.0"
version = "4.3.0"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
optional = false
python-versions = ">=3.8"
files = [
{file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"},
{file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"},
{file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
{file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
]
[package.dependencies]
......@@ -2149,13 +2149,13 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=
[[package]]
name = "nbconvert"
version = "7.16.0"
description = "Converting Jupyter Notebooks"
version = "7.16.1"
description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
optional = false
python-versions = ">=3.8"
files = [
{file = "nbconvert-7.16.0-py3-none-any.whl", hash = "sha256:ad3dc865ea6e2768d31b7eb6c7ab3be014927216a5ece3ef276748dd809054c7"},
{file = "nbconvert-7.16.0.tar.gz", hash = "sha256:813e6553796362489ae572e39ba1bff978536192fb518e10826b0e8cadf03ec8"},
{file = "nbconvert-7.16.1-py3-none-any.whl", hash = "sha256:3188727dffadfdc9c6a1c7250729063d7bc78b355ad7aa023138afa030d1cd07"},
{file = "nbconvert-7.16.1.tar.gz", hash = "sha256:e79e6a074f49ba3ed29428ed86487bf51509d9aab613bd8522ac08f6d28fd7fd"},
]
[package.dependencies]
......@@ -2917,19 +2917,23 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pydantic-settings"
version = "2.1.0"
version = "2.2.0"
description = "Settings management using Pydantic"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydantic_settings-2.1.0-py3-none-any.whl", hash = "sha256:7621c0cb5d90d1140d2f0ef557bdf03573aac7035948109adf2574770b77605a"},
{file = "pydantic_settings-2.1.0.tar.gz", hash = "sha256:26b1492e0a24755626ac5e6d715e9077ab7ad4fb5f19a8b7ed7011d52f36141c"},
{file = "pydantic_settings-2.2.0-py3-none-any.whl", hash = "sha256:5f7bcaf9ad4419559dc5ac155c0324a9aeb2547c60471ee7c7d026f467a6b515"},
{file = "pydantic_settings-2.2.0.tar.gz", hash = "sha256:648d0a76673e69c51278979cba2e83cf16a23d57519bfd7e553d1c3f37db5560"},
]
[package.dependencies]
pydantic = ">=2.3.0"
python-dotenv = ">=0.21.0"
[package.extras]
toml = ["tomlkit (>=0.12)"]
yaml = ["pyyaml (>=6.0.1)"]
[[package]]
name = "pygments"
version = "2.17.2"
......@@ -3961,13 +3965,13 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake
[[package]]
name = "urllib3"
version = "2.2.0"
version = "2.2.1"
description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false
python-versions = ">=3.8"
files = [
{file = "urllib3-2.2.0-py3-none-any.whl", hash = "sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224"},
{file = "urllib3-2.2.0.tar.gz", hash = "sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20"},
{file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
{file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
]
[package.extras]
......
access:
files: public
files: restricted
record: public
files:
default_preview: null
......@@ -24,13 +24,13 @@ metadata:
publisher: TU Wien
related_identifiers:
- identifier: https://www2.projects.science.uu.nl/memotion/emotifydata/
relation:
relation_type:
id: isderivedfrom
resource_type:
id: sound
scheme: url
- identifier: https://gitlab.tuwien.ac.at/martin.weise/fairnb
relation:
relation_type:
id: isderivedfrom
resource_type:
id: software
......