*** Wartungsfenster jeden ersten Mittwoch vormittag im Monat ***

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • martin.weise/fairnb
1 result
Show changes
Commits on Source (6)
Showing
with 324 additions and 2181 deletions
host: https://dbrepo1.ec.tuwien.ac.at
database-id: <insert database id>
database-id: 23
credentials:
username: <insert username from dbrepo>
password: <insert password from dbrepo>
......
host: https://researchdata.tuwien.ac.at
credentials:
token: <insert token from invenio>
\ No newline at end of file
token: <insert token of InvenioRDM API>
\ No newline at end of file
......@@ -12,7 +12,7 @@ from keycloak import KeycloakOpenID
LOG = logging.getLogger(__name__)
TIMEOUT = 600
CHUNK_SIZE = 1024 * 1024 * 100
CHUNK_SIZE = 1024 * 1024 * 20
def re_auth(func: Callable) -> Callable:
@wraps(func)
......@@ -210,10 +210,10 @@ class DBRepoConnector:
return None
@re_auth
def create_table(self, dataframe: pd.DataFrame, table_name: str, table_descriptor: str):
def create_table(self, dataframe: pd.DataFrame, table_name: str, table_description: str):
""" Creates a new table """
data = self._create_table_data(dataframe, table_name, table_descriptor)
data = self._create_table_data(dataframe, table_name, table_description)
response = requests.post(
f"{self.host}/api/database/{self.database_id}/table",
......@@ -234,10 +234,10 @@ class DBRepoConnector:
def create_table_if_not_exists(self,
dataframe: pd.DataFrame,
table_name: str,
table_descriptor: str
table_description: str
):
table = table if (table := self.get_table(table_name)) is not None else \
self.create_table(dataframe, table_name, table_descriptor)
self.create_table(dataframe, table_name, table_description)
return table
......@@ -269,6 +269,7 @@ class DBRepoConnector:
"quote": '"',
"separator": ",",
"skip_lines": 1,
"line_termination": "\n",
"true_element": "True"
},
headers=self.headers
......
......@@ -8,7 +8,7 @@ import requests as rq
log = logging.getLogger(__name__)
class InvenioConnector:
class InvenioRDMConnector:
def __init__(self, token: str, host: str | None = None):
self.host = host or "https://test.researchdata.tuwien.ac.at"
self.token = token
......@@ -130,14 +130,14 @@ class InvenioConnector:
executor.map(lambda p: self.download_file(*p), args)
class InvenioManager:
class InvenioRDMManager:
"""A high level interface to up- and download files from invenio.
Utilizes state management:
1. record not assigned
2. record assigned
3. record published"""
def __init__(self, invenio_connector: InvenioConnector, record_id: str | None = None):
def __init__(self, invenio_connector: InvenioRDMConnector, record_id: str | None = None):
self.connector = invenio_connector
self.record_id = record_id
self.doi = None
......
......@@ -9,7 +9,7 @@ from fairnb.entity.entity_provenance import EntityProvenance
@dataclass
class DbRepoEntity(Entity):
class DBRepoEntity(Entity):
table_name: str = field(init=True, default=None)
table_description: str = field(init=True, default="")
table_id: int = field(init=False, default=None)
......@@ -19,7 +19,7 @@ class DbRepoEntity(Entity):
super().__post_init__()
if self.metadata is not None: # equivalent to: self.id is not None
self.table_id = int(self.metadata.uri.split("/")[-1])
self.table_id = int(self.metadata.pi.split("/")[-1])
else:
assert self.table_name is not None # has to exist fot the ability to get table_id
......@@ -60,9 +60,10 @@ class DbRepoEntity(Entity):
self.location.resolve().parent.mkdir(parents=True, exist_ok=True)
df.to_csv(self.location, index=False)
def upload(self, executed_file: Path, dependencies: list[Entity] = None,
def upload(self, executed_file: Path, main_file: Path,
dependencies: list[Entity] = None,
start_time: datetime = datetime.now(),
end_time: datetime = datetime.now()):
end_time: datetime = datetime.now()) -> EntityProvenance:
df = pd.read_csv(self.location)
# add id column to df:
......@@ -80,10 +81,11 @@ class DbRepoEntity(Entity):
self.name,
self.description,
executed_file=executed_file,
uri=f"{self.dbrepo_connector.host}/database/"
main_file=main_file,
pi=f"{self.dbrepo_connector.host}/database/"
f"{self.dbrepo_connector.database_id}/table/{self.table_id}",
type=self.type,
platform=self.repository,
repository=self.repository,
started_at=start_time,
ended_at=end_time
)
......@@ -98,6 +100,8 @@ class DbRepoEntity(Entity):
self.upload_dependencies(dependencies)
return self.metadata
def upload_data(self, df: pd.DataFrame):
assert self.id is not None
assert self.table_id is not None
......
......@@ -11,12 +11,13 @@ from fairnb.api.dbrepo import DBRepoConnector
from fairnb.entity.entity_provenance import EntityProvenance
PROVENANCE_TABLE_NAME = "entity_provenance_test3"
DEPENDENCY_TABLE_NAME = "entity_dependencies_test3"
PROVENANCE_TABLE_NAME = "entity_provenance"
DEPENDENCY_TABLE_NAME = "entity_dependencies"
LOG = logging.getLogger(__name__)
# TODO: Upload Datetime objects as Timestamps instead of str
@dataclass
class Entity(ABC):
""" A O-Prov Entity class used to represent an Entity created by a notebook.
......@@ -65,12 +66,12 @@ class Entity(ABC):
self.download_provenance()
@abstractmethod
def download(self) -> EntityProvenance:
def download(self):
"""Download this Entity and return the attached EntityProvenance"""
raise NotImplementedError
@abstractmethod
def upload(self, executed_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
def upload(self, executed_file: Path, main_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
"""Upload this Entity"""
raise NotImplementedError
......@@ -131,6 +132,8 @@ class Entity(ABC):
self.id = meta.id
self.metadata = meta
LOG.info(f"Uploaded provenance information for {self.name} with id {self.id}: {self.metadata}")
def upload_dependencies(self, dependencies):
""" Upload the dependency information for this Entity.
It lists all entities, which have an id, this entity depends on.
......
......@@ -15,7 +15,7 @@ class EntityProvenance:
"""
id: str | None # id of entity, always unique
uri: str # unique resource identifier used to locate entity (can also be used to point to table containing entity)
pi: str # persistent identifier used to locate entity (can also be used to point to table containing entity)
name: str # name of specific entity describing the data it contains
description: str # more detailed description of the enitity
type: str # type of entity, if notebook is run with different data type stays the same
......@@ -23,9 +23,10 @@ class EntityProvenance:
branch: str # the branch of the repository, makes manual search of commit easier
repo_uri: str # the uri of the repository, used to locate the repository
executed_file: str # path to notebook which was executed to create the entity
started_at: datetime # start time of execution where entity was created
ended_at: datetime # end time of execution where entity was created
platform: str # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
main_file: str # path to the main file executing the notebook
started_at: datetime # start time of execution where entity was created
ended_at: datetime # end time of execution where entity was created
repository: str # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
@classmethod
def new(
......@@ -33,9 +34,10 @@ class EntityProvenance:
name: str,
description: str,
executed_file: Path,
main_file: Path,
type: str,
uri: str,
platform: str,
pi: str,
repository: str,
started_at: datetime,
ended_at: datetime
):
......@@ -50,20 +52,22 @@ class EntityProvenance:
repo_uri = re.sub(":\d+/", "/", f"https://{repo_uri.split('@', 1)[1]}")
executed_file_rel = executed_file.resolve().relative_to(BASE_PATH)
main_file_rel = main_file.resolve().relative_to(BASE_PATH)
return cls(
id=None,
name=name,
description=description,
uri=uri,
pi=pi,
commit=commit,
repo_uri=repo_uri,
started_at=started_at,
ended_at=ended_at,
branch=branch,
executed_file=executed_file_rel.as_posix(),
main_file=main_file_rel.as_posix(),
type=type,
platform=platform,
repository=repository,
)
@classmethod
......@@ -72,10 +76,11 @@ class EntityProvenance:
id=df["id"],
name=df["name"],
description=df["description"],
uri=df["uri"],
pi=df["pi"],
commit=df["commit"],
repo_uri=df["git_uri"],
executed_file=df["executed_file"],
main_file=df["main_file"],
started_at=datetime.strptime(
df["started_at"], "%Y-%m-%d %H:%M:%S.%f"
), # TODO: replace with '%F %T'
......@@ -84,7 +89,7 @@ class EntityProvenance:
),
branch=df["branch"],
type=df["type"],
platform=df["repository"],
repository=df["repository"],
)
def to_frame(self):
......@@ -93,14 +98,15 @@ class EntityProvenance:
"id": pd.Series(self.id, dtype=str),
"name": pd.Series(self.name, dtype=str),
"description": pd.Series(self.description, dtype=str),
"uri": pd.Series(self.uri, dtype=str),
"pi": pd.Series(self.pi, dtype=str),
"commit": pd.Series(self.commit, dtype=str),
"git_uri": pd.Series(self.repo_uri, dtype=str),
"executed_file": pd.Series(self.executed_file, dtype=str),
"main_file": pd.Series(self.main_file, dtype=str),
"started_at": pd.Series(self.started_at, dtype=str),
"ended_at": pd.Series(self.ended_at, dtype=str),
"branch": pd.Series(self.branch, dtype=str),
"type": pd.Series(self.type, dtype=str),
"repository": pd.Series(self.platform, dtype=str),
"repository": pd.Series(self.repository, dtype=str),
}
)
......@@ -3,14 +3,14 @@ from datetime import datetime
from pathlib import Path
from fairnb.api.dbrepo import DBRepoConnector
from fairnb.api.invenio import InvenioManager, InvenioConnector
from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
from fairnb.entity.entity import Entity
from fairnb.entity.entity_provenance import EntityProvenance
@dataclass
class InvenioEntity(Entity):
invenio_manager: InvenioManager = field(init=True, default=None)
class InvenioRDMEntity(Entity):
invenio_manager: InvenioRDMManager = field(init=True, default=None)
record_metadata: dict = field(init=True, default=None)
publish_record: bool = field(init=True, default=False)
platform: str = field(init=False, default="https://doi.org/10.17616/R31NJMYD")
......@@ -24,11 +24,11 @@ class InvenioEntity(Entity):
description: str,
type: str,
dbrepo_connector: DBRepoConnector,
invenio_connector: InvenioConnector,
invenio_connector: InvenioRDMConnector,
publish_record: bool = False,
):
return cls(
invenio_manager=InvenioManager(invenio_connector),
invenio_manager=InvenioRDMManager(invenio_connector),
record_metadata=record_metadata,
dbrepo_connector=dbrepo_connector,
location=location,
......@@ -44,13 +44,13 @@ class InvenioEntity(Entity):
id: str,
location: Path,
dbrepo_connector: DBRepoConnector,
invenio_connector: InvenioConnector,
invenio_connector: InvenioRDMConnector,
):
return cls(
id=id,
location=location,
dbrepo_connector=dbrepo_connector,
invenio_manager=InvenioManager(invenio_connector)
invenio_manager=InvenioRDMManager(invenio_connector)
)
def __post_init__(self):
......@@ -60,9 +60,10 @@ class InvenioEntity(Entity):
assert self.record_metadata is not None
return
self.invenio_manager.record_id = self.metadata.uri.split('/')[-1]
self.invenio_manager.record_id = self.metadata.pi.split('/')[-1]
def upload(self, executed_file: Path, dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
def upload(self, executed_file: Path, main_file: Path,
dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
dir_path: Path
regex: str
......@@ -91,9 +92,10 @@ class InvenioEntity(Entity):
name=self.name,
description=self.description,
executed_file=executed_file,
uri=uri.replace('/api', ''),
main_file=main_file,
pi=uri.replace('/api', ''),
type=self.type,
platform=self.platform,
repository=self.platform,
started_at=started_at,
ended_at=ended_at,
)
......
......@@ -73,6 +73,7 @@ class Executor:
# use inspect to get path of caller
entity.upload(
nb_config.nb_location,
nb_config.main_location,
nb_config.dependencies,
nb_config.started_at,
nb_config.ended_at
......
......@@ -8,6 +8,7 @@ from fairnb.entity.entity import Entity
@dataclass
class NbConfig:
nb_location: Path
main_location: Path
entities: list[Entity]
dependencies: list[Entity]
nb_output_location: Path = field(init=True, default=None)
......
......@@ -5,7 +5,7 @@ import pandas as pd
import tarfile
from fairnb.api.dbrepo import DBRepoConnector
from fairnb.api.invenio import InvenioManager, InvenioConnector
from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
from definitions import CONFIG_PATH
import yaml
......@@ -46,14 +46,14 @@ class Util:
def get_invenio_connector(self, path: pathlib.Path = None):
config = self.get_config(path=path)
return InvenioConnector(
return InvenioRDMConnector(
token=config["credentials"]["token"],
host=config["host"]
)
def get_invenio_manager(self, path: pathlib.Path = None):
config = self.get_config(path=path)
return InvenioManager(
return InvenioRDMManager(
self.get_invenio_connector(path=path)
)
......
%% Cell type:markdown id:4389a8092677254e tags:
# Audio Files
Bundle the provided audio files (400, in MP3) in a tar, encrypt it using gzip and store it in the output folder.
%% Cell type:code id:87ab37c6 tags:
``` python
from definitions import BASE_PATH
import tarfile
import zipfile
import os
from pathlib import Path
```
%% Cell type:code id:1b4e6b01 tags:parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": str(BASE_PATH / "tmp/1_audio_files/output/emotifymusic.tar.gz")
}
```
%% Cell type:code id:15dea136 tags:injected-parameters
%% Cell type:code id:1a6df3b0 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/1_audio_files/output/emotifymusic.tar.gz"
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/1_audio_files/output/emotifymusic.tar.gz"
}
```
%% Cell type:code id:1e487573 tags:
``` python
# load provided files
zip_path = BASE_PATH / "resource" / "1_audio_files" / "emotifymusic.zip"
dir_path = BASE_PATH / "tmp" / "1_audio_files" / "music"
dir_path.mkdir(parents=True, exist_ok=True)
# unzip to dir_path
with zipfile.ZipFile(zip_path, "r") as zfile:
zfile.extractall(path=dir_path)
```
%% Cell type:code id:c3193f35 tags:
``` python
file_paths = list(dir_path.rglob('**/*.*'))
flattened_dir_path = BASE_PATH / "tmp" / "1_audio_files" / "flattened"
flattened_dir_path.mkdir(parents=True, exist_ok=True)
for path in file_paths:
(flattened_dir_path / path.relative_to(dir_path).as_posix().replace('/', '_')).write_bytes(path.read_bytes())
```
%% Cell type:code id:3272ea2b tags:
``` python
tar_path = Path(OUTPUT_PATHS["audio_tar"])
tar_path.parent.mkdir(parents=True, exist_ok=True)
with tarfile.open(tar_path, "w:gz") as file:
file.add(flattened_dir_path, arcname=os.path.sep)
```
......
%% Cell type:markdown id:699a83ce tags:
# Feature Extraction of Base audio files from Invenio
%% Cell type:code id:6463a609 tags:
``` python
from contextlib import contextmanager, redirect_stderr, redirect_stdout
import pandas as pd
import librosa
import tarfile
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from definitions import BASE_PATH
import os
```
%% Cell type:code id:f025335b tags:parameters
``` python
INPUT_PATH = BASE_PATH / "tmp" / "2_generate_features" / "input"
OUTPUT_PATH = BASE_PATH / "tmp" / "2_generate_features" / "output"
INPUT_PATHS: dict[str, str] = {
"audio_tar": (INPUT_PATH / "emotifymusic.tar.gz").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"raw_features": (OUTPUT_PATH / "raw_features.csv").__str__()
}
```
%% Cell type:code id:704afac7 tags:injected-parameters
%% Cell type:code id:f640e1a8 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/2_generate_features/input/emotifymusic.tar.gz"
}
OUTPUT_PATHS = {
"raw_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/2_generate_features/output/raw_features.csv"
}
```
%% Cell type:code id:10f1b3cd tags:
``` python
# inputs
DEFAULT_SAMPLING_RATE = 22050
assert INPUT_PATH.exists() and INPUT_PATH.is_dir()
with tarfile.open(audio_gz := Path(INPUT_PATHS["audio_tar"]).resolve(), "r:gz") as archive:
archive.extractall(path=(path_out := audio_gz.with_suffix("").with_suffix("")))
files = list(path_out.rglob("**/*.*"))
```
%% Cell type:code id:469af6f9 tags:
``` python
@contextmanager
def suppress_stdout_stderr():
"""A context manager that redirects stdout and stderr to devnull"""
with open(os.devnull, 'w') as fnull:
with redirect_stderr(fnull) as err, redirect_stdout(fnull) as out:
yield err, out
```
%% Cell type:code id:316f6c17 tags:
``` python
def generate_mfcc_feature(filepath: Path, sr: int = DEFAULT_SAMPLING_RATE, number_mfccs: int = 40):
x, _ = load_mp3(filepath, sr=sr)
assert sr == _
mfcc = librosa.feature.mfcc(x, sr=sr, n_mfcc=number_mfccs)
# transpose to use mfcc bands as columns instead of rows
return pd.DataFrame(mfcc).transpose()
def load_mp3(filepath: Path, sr: int = DEFAULT_SAMPLING_RATE):
x, sr = librosa.load(filepath, sr=sr) # extract wave (x) with sample rate (sr)
return x, sr
with suppress_stdout_stderr(), ThreadPoolExecutor(6) as executor:
dataframes = list(executor.map(
lambda args: generate_mfcc_feature(args), files)
)
```
%% Cell type:code id:acc9bae8 tags:
``` python
for file, dataframe in zip(files, dataframes):
dataframe["sample"] = dataframe.index.to_numpy(copy=True)
dataframe["filename"] = file.name
dataframe["label"] = file.name.split('_')[0] # extract genre from file name
dataframe_concat = pd.concat(dataframes)
columns_old = list(dataframe_concat.columns)
columns = columns_old[-3:] + columns_old[:-3]
dataframe_concat = dataframe_concat[columns]
output: pd.DataFrame = dataframe_concat
output
```
%% Output
sample filename label 0 1 \
0 0 classical_8.mp3 classical -513.835449 0.000000
1 1 classical_8.mp3 classical -430.772858 99.951447
2 2 classical_8.mp3 classical -312.093567 159.784668
3 3 classical_8.mp3 classical -243.798019 168.200287
4 4 classical_8.mp3 classical -250.946625 182.020203
... ... ... ... ... ...
2581 2581 electronic_28.mp3 electronic -4.531759 85.749336
2582 2582 electronic_28.mp3 electronic -21.892481 64.973923
2583 2583 electronic_28.mp3 electronic -26.937489 59.654442
2584 2584 electronic_28.mp3 electronic -37.675701 69.980713
2585 2585 electronic_28.mp3 electronic -69.959473 90.579102
2 3 4 5 6 ... 30 \
0 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000
1 61.102493 28.070032 15.340330 15.008282 11.502503 ... -4.017534
2 31.906086 25.901234 6.815042 3.911939 21.410465 ... 3.267372
3 16.092997 34.248627 3.439126 4.217156 16.333824 ... 8.645699
4 12.093463 31.393484 10.792284 5.874646 15.635584 ... 6.143005
... ... ... ... ... ... ... ...
2581 3.175902 29.282883 10.520454 28.353235 7.040113 ... -0.076582
2582 0.638062 30.259424 3.547897 25.982525 12.492319 ... -4.140548
2583 3.198796 36.822197 -0.308186 17.223629 12.519827 ... -2.150106
2584 6.486831 36.693054 -2.817516 14.450989 9.200117 ... 0.592433
2585 12.684738 39.559166 -2.489999 13.447134 2.889965 ... 2.153978
31 32 33 34 35 36 \
0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1 -2.689229 -2.293572 -2.991963 -3.644343 -4.003089 -4.528318
2 -2.944059 -7.677339 -3.628831 -4.110184 -14.840838 -3.495162
3 -5.766571 -5.486410 -3.288999 -3.853479 -19.015926 -7.971353
4 -2.007963 -7.107271 -5.137182 -7.456434 -19.914568 -8.567856
... ... ... ... ... ... ...
2581 10.373774 -3.842222 1.740638 -4.820115 5.424960 -0.350912
2582 8.154976 -8.581367 0.991196 -7.903484 5.064352 -7.015607
2583 6.751756 -8.335445 -3.181783 -11.748012 3.223699 -10.738268
2584 4.523458 -8.737437 -4.725236 -7.613096 1.976833 -9.998651
2585 6.035127 -8.183851 -0.212283 -1.487655 -2.779953 -5.455588
37 38 39
0 0.000000 0.000000 0.000000
1 -4.626081 -2.798346 0.923011
2 8.776964 -4.981813 -10.156776
3 9.408128 -3.466177 -11.191519
4 4.395530 -5.535549 -9.764086
... ... ... ...
2581 3.484543 4.927905 7.667750
2582 2.761323 2.499545 4.854020
2583 -1.915628 -2.164130 -0.500030
2584 -1.651334 -1.831298 -1.857335
2585 0.809570 -1.209018 -1.631956
[1029854 rows x 43 columns]
%% Cell type:code id:0abf745b tags:
``` python
# outputs
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
output.to_csv(OUTPUT_PATHS["raw_features"], index=False)
```
......
%% Cell type:markdown id:f48a4573 tags:
# Aggregate MFCC Features
Aggregate from n rows par file to 1 (calculate min, max, etc. for each feature).
%% Cell type:code id:389576b8 tags:
``` python
from pathlib import Path
import pandas as pd
from definitions import BASE_PATH
```
%% Cell type:code id:26f640e0 tags:parameters
``` python
INPUT_PATH = BASE_PATH / "tmp" / "3_aggregate_features" / "input"
OUTPUT_PATH = BASE_PATH / "tmp" / "3_aggregate_features" / "output"
INPUT_PATHS: dict[str, str] = {
"raw_features": (INPUT_PATH / "raw_features.csv").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"features": (OUTPUT_PATH / "features.csv").__str__()
}
```
%% Cell type:code id:88ecee07 tags:injected-parameters
%% Cell type:code id:40dbf7fa tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"raw_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/3_aggregate_features/input/raw_features.csv"
"raw_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/input/raw_features.csv"
}
OUTPUT_PATHS = {
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/3_aggregate_features/output/features.csv"
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/output/features.csv"
}
```
%% Cell type:code id:c5d9d980 tags:
``` python
# inputs
raw_features = pd.read_csv(INPUT_PATHS["raw_features"], index_col=False)
```
%% Cell type:code id:99f75f47 tags:
``` python
meta_columns = ["sample", "filename", "label"]
mfcc_aggregated = raw_features\
.drop(meta_columns, axis=1, errors='ignore')\
.groupby(raw_features.filename).agg(['min', 'max', 'mean', 'std', 'skew'])
mfcc_meta = pd.DataFrame(raw_features['label'].groupby(raw_features.filename).last())
mfcc_meta.columns = pd.MultiIndex.from_arrays([['label'], ['']]) # needed for merge
mfcc_merged = pd.merge(mfcc_meta, mfcc_aggregated, left_index=True, right_index=True)
# reduce multi index to single index
one_level_cols = ['_'.join([str(el) for el in col]) for col in mfcc_merged.columns[1:]]
one_level_cols.insert(0, "label")
mfcc_merged.columns = pd.Index(one_level_cols)
mfcc_merged = mfcc_merged.reset_index()
mfcc_merged
```
%% Output
filename label 0_min 0_max 0_mean \
0 classical_1.mp3 classical -530.78436 -163.308350 -302.203167
1 classical_10.mp3 classical -562.85785 -96.164795 -219.259016
2 classical_100.mp3 classical -536.23737 -61.608826 -177.804114
3 classical_11.mp3 classical -536.45746 -120.429665 -222.126303
4 classical_12.mp3 classical -562.67523 -148.133560 -270.975406
.. ... ... ... ... ...
395 rock_95.mp3 rock -553.11010 -5.218835 -193.506047
396 rock_96.mp3 rock -541.23600 27.163334 -119.113996
397 rock_97.mp3 rock -518.49500 58.526745 -66.267744
398 rock_98.mp3 rock -518.64307 53.555115 -45.734517
399 rock_99.mp3 rock -544.70310 75.612130 -49.380943
0_std 0_skew 1_min 1_max 1_mean ... 38_min \
0 51.142183 -0.468374 0.000000 178.75162 111.332342 ... -44.098070
1 53.561838 -0.772320 0.029056 259.63270 215.094182 ... -27.458416
2 83.381622 -2.587179 0.000000 190.47589 112.471713 ... -27.335688
3 76.246992 -2.402418 0.000000 159.42575 99.853645 ... -31.774948
4 52.191182 -0.366586 0.000000 194.26416 148.226647 ... -44.843810
.. ... ... ... ... ... ... ...
395 76.869437 -0.201055 -89.948746 201.18045 111.724191 ... -27.043941
396 58.420684 -0.957699 -7.415961 210.49246 125.453699 ... -37.584858
397 65.635619 -0.898026 -58.824410 175.20135 99.288265 ... -29.620445
398 52.444200 -1.705641 0.000000 187.04274 96.440874 ... -26.967848
399 54.045627 -0.863093 -32.930653 191.73538 93.971242 ... -21.929403
38_max 38_mean 38_std 38_skew 39_min 39_max 39_mean \
0 47.308060 -3.713503 16.553984 0.230691 -46.794480 49.352516 -2.282116
1 29.811110 0.484271 8.660648 -0.479016 -28.989983 27.533710 0.952658
2 27.610388 -0.333233 8.185075 0.208425 -38.095375 31.397880 -1.494916
3 31.500881 -3.781627 9.191043 0.260886 -22.667440 50.992897 1.600777
4 28.490644 -6.242015 10.546545 0.341848 -25.040888 46.878204 1.844494
.. ... ... ... ... ... ... ...
395 22.451445 -7.234634 8.471853 0.753855 -24.712723 23.410387 -4.502398
396 28.087936 -9.704238 8.447620 0.112760 -38.147890 21.814402 -8.249507
397 26.325895 -5.722825 7.727378 0.207489 -29.497524 25.410654 -3.356614
398 8.714737 -9.511491 5.551820 -0.025604 -23.020084 13.948638 -2.664985
399 17.050608 -5.296691 5.894963 0.390705 -20.983192 29.312023 -0.321836
39_std 39_skew
0 15.285639 0.171462
1 10.477735 -0.185771
2 10.917299 0.020985
3 10.125545 0.595763
4 11.160392 0.503120
.. ... ...
395 6.687984 0.238807
396 7.807756 0.071968
397 8.170526 0.160330
398 5.051498 -0.258407
399 6.571660 0.384794
[400 rows x 202 columns]
%% Cell type:code id:4ac5c765 tags:
``` python
# outputs
aggregated_features_path = Path(OUTPUT_PATHS["aggregated_features"]).resolve()
aggregated_features_path.parent.mkdir(parents=True, exist_ok=True)
output = mfcc_merged
output.to_csv(aggregated_features_path, index=False)
```
......
%% Cell type:markdown id:e92b4fe9 tags:
# Split the Features into Train and Test Set
%% Cell type:code id:5f1fae44 tags:
``` python
import pandas as pd
from pathlib import Path
from definitions import BASE_PATH
```
%% Cell type:code id:01de1b27 tags:parameters
``` python
# Tagged with 'parameters'
from definitions import BASE_PATH
INPUT_PATHS: dict[str, str] = {
"features": (BASE_PATH / "tmp" / "4_split" / "input" / "features.csv").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"split": (BASE_PATH / "tmp" / "4_split" / "output" / "split.csv").__str__()
}
```
%% Cell type:code id:e99ca0ba tags:injected-parameters
%% Cell type:code id:fdc0a0a6 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/input/features.csv"
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/input/features.csv"
}
OUTPUT_PATHS = {
"split": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/output/split.csv"
"split": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/output/split.csv"
}
```
%% Cell type:code id:a4cc6800 tags:
``` python
# INPUT
for path in INPUT_PATHS.values():
assert Path(path).exists()
features = pd.read_csv(INPUT_PATHS["aggregated_features"])
```
%% Cell type:code id:a186d0c4 tags:
``` python
train = features.sample(frac=0.8).sort_index()
train = features.sample(frac=0.8, random_state=11908553).sort_index()
test = features.drop(train.index)
split_true = pd.DataFrame({
"filename": train.filename,
"train": True
})
split_false = pd.DataFrame({
"filename": test.filename,
"train": False
})
split_concat = pd.concat([split_true, split_false])\
.sort_values("filename")\
.reset_index(drop=True)
```
%% Cell type:code id:091e0641 tags:
``` python
split_concat
```
%% Output
filename train
0 classical_1.mp3 False
1 classical_10.mp3 True
2 classical_100.mp3 False
3 classical_11.mp3 True
4 classical_12.mp3 True
.. ... ...
395 rock_95.mp3 False
396 rock_96.mp3 True
397 rock_97.mp3 True
398 rock_98.mp3 True
399 rock_99.mp3 True
[400 rows x 2 columns]
%% Cell type:code id:7b11b8bb tags:
``` python
# output
OUTPUT_PATH = Path(OUTPUT_PATHS["split"])
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
output = split_concat
output.to_csv(OUTPUT_PATH, index=False)
```
......
%% Cell type:markdown id:5de30442 tags:
# ML Experiment code
# Inputs: splits & aggregated features
%% Cell type:code id:a2eb8998 tags:
``` python
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
from pandas import DataFrame, Index
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt
from definitions import BASE_PATH
```
%% Cell type:code id:8a8da20f tags:parameters
``` python
# Tagged with 'parameters'
# Default (un-injected) locations for this notebook's inputs and outputs;
# papermill-style parameter injection overrides these in the next cell.
INPUT_PATH = BASE_PATH / "tmp" / "5_ml_model" / "input"
OUTPUT_PATH = BASE_PATH / "tmp" / "5_ml_model" / "output"
# Expected inputs: the train/test split table and the feature table.
INPUT_PATHS: dict[str, str] = {
    "split": (INPUT_PATH / "split.csv").__str__(),
    "features": (INPUT_PATH / "features.csv").__str__()
}
# Expected outputs: prediction table and the pickled classifier.
OUTPUT_PATHS: dict[str, str] = {
    "submission": (OUTPUT_PATH / "submission.csv").__str__(),
    "clf": (OUTPUT_PATH / "clf.pickle").__str__()
}
```
%% Cell type:code id:08b56684 tags:injected-parameters
%% Cell type:code id:1229e75d tags:injected-parameters
``` python
# Parameters
# NOTE(review): the duplicate dictionary keys below are a revision-diff
# artifact (old fairnb paths vs. new dbrepo-ismir paths); Python keeps the
# last occurrence, so the dbrepo-ismir paths are the effective values.
INPUT_PATHS = {
    "split": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/5_ml_model/input/split.csv",
    "aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/5_ml_model/input/features.csv",
    "split": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/5_ml_model/input/split.csv",
    "aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/5_ml_model/input/features.csv",
}
OUTPUT_PATHS = {
    "clf": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/5_ml_model/output/ml_model.pickle",
    "submission": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/5_ml_model/output/test_result.csv",
    "clf": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/5_ml_model/output/ml_model.pickle",
    "submission": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/5_ml_model/output/test_result.csv",
}
```
%% Cell type:code id:6810272a tags:
``` python
# input
# Load the train/test split flags and the aggregated feature table produced
# by the upstream notebooks.
split: pd.DataFrame = pd.read_csv(INPUT_PATHS["split"])
features: pd.DataFrame = pd.read_csv(INPUT_PATHS["aggregated_features"])
```
%% Cell type:code id:36f06fd6 tags:
``` python
# Attach the train/test flag to each feature row (inner join on filename)
# and use the filename as the index from here on.
joined = features.merge(split, on="filename").set_index("filename")
joined
```
%% Output
label 0_min 0_max 0_mean 0_std \
filename
classical_1.mp3 classical -530.78436 -163.308350 -302.203167 51.142183
classical_10.mp3 classical -562.85785 -96.164795 -219.259016 53.561838
classical_100.mp3 classical -536.23737 -61.608826 -177.804114 83.381622
classical_11.mp3 classical -536.45746 -120.429665 -222.126303 76.246992
classical_12.mp3 classical -562.67523 -148.133560 -270.975406 52.191182
... ... ... ... ... ...
rock_95.mp3 rock -553.11010 -5.218835 -193.506047 76.869437
rock_96.mp3 rock -541.23600 27.163334 -119.113996 58.420684
rock_97.mp3 rock -518.49500 58.526745 -66.267744 65.635619
rock_98.mp3 rock -518.64307 53.555115 -45.734517 52.444200
rock_99.mp3 rock -544.70310 75.612130 -49.380943 54.045627
0_skew 1_min 1_max 1_mean 1_std ... \
filename ...
classical_1.mp3 -0.468374 0.000000 178.75162 111.332342 24.847563 ...
classical_10.mp3 -0.772320 0.029056 259.63270 215.094182 18.388131 ...
classical_100.mp3 -2.587179 0.000000 190.47589 112.471713 27.277553 ...
classical_11.mp3 -2.402418 0.000000 159.42575 99.853645 21.916949 ...
classical_12.mp3 -0.366586 0.000000 194.26416 148.226647 19.305008 ...
... ... ... ... ... ... ...
rock_95.mp3 -0.201055 -89.948746 201.18045 111.724191 36.463584 ...
rock_96.mp3 -0.957699 -7.415961 210.49246 125.453699 31.908869 ...
rock_97.mp3 -0.898026 -58.824410 175.20135 99.288265 25.158416 ...
rock_98.mp3 -1.705641 0.000000 187.04274 96.440874 24.137702 ...
rock_99.mp3 -0.863093 -32.930653 191.73538 93.971242 33.410220 ...
38_max 38_mean 38_std 38_skew 39_min \
filename
classical_1.mp3 47.308060 -3.713503 16.553984 0.230691 -46.794480
classical_10.mp3 29.811110 0.484271 8.660648 -0.479016 -28.989983
classical_100.mp3 27.610388 -0.333233 8.185075 0.208425 -38.095375
classical_11.mp3 31.500881 -3.781627 9.191043 0.260886 -22.667440
classical_12.mp3 28.490644 -6.242015 10.546545 0.341848 -25.040888
... ... ... ... ... ...
rock_95.mp3 22.451445 -7.234634 8.471853 0.753855 -24.712723
rock_96.mp3 28.087936 -9.704238 8.447620 0.112760 -38.147890
rock_97.mp3 26.325895 -5.722825 7.727378 0.207489 -29.497524
rock_98.mp3 8.714737 -9.511491 5.551820 -0.025604 -23.020084
rock_99.mp3 17.050608 -5.296691 5.894963 0.390705 -20.983192
39_max 39_mean 39_std 39_skew train
filename
classical_1.mp3 49.352516 -2.282116 15.285639 0.171462 True
classical_10.mp3 27.533710 0.952658 10.477735 -0.185771 True
classical_100.mp3 31.397880 -1.494916 10.917299 0.020985 True
classical_11.mp3 50.992897 1.600777 10.125545 0.595763 True
classical_12.mp3 46.878204 1.844494 11.160392 0.503120 False
... ... ... ... ... ...
rock_95.mp3 23.410387 -4.502398 6.687984 0.238807 True
rock_96.mp3 21.814402 -8.249507 7.807756 0.071968 True
rock_97.mp3 25.410654 -3.356614 8.170526 0.160330 True
rock_98.mp3 13.948638 -2.664985 5.051498 -0.258407 True
rock_99.mp3 29.312023 -0.321836 6.571660 0.384794 True
[400 rows x 202 columns]
%% Cell type:code id:265d042f tags:
``` python
# Training partition: rows flagged train == True, with the flag column removed.
train: DataFrame = joined.loc[joined["train"] == True].drop(columns="train")
train
```
%% Output
label 0_min 0_max 0_mean 0_std \
filename
classical_1.mp3 classical -530.78436 -163.308350 -302.203167 51.142183
classical_10.mp3 classical -562.85785 -96.164795 -219.259016 53.561838
classical_100.mp3 classical -536.23737 -61.608826 -177.804114 83.381622
classical_11.mp3 classical -536.45746 -120.429665 -222.126303 76.246992
classical_13.mp3 classical -637.72064 -177.713960 -361.834032 71.310080
... ... ... ... ... ...
rock_95.mp3 rock -553.11010 -5.218835 -193.506047 76.869437
rock_96.mp3 rock -541.23600 27.163334 -119.113996 58.420684
rock_97.mp3 rock -518.49500 58.526745 -66.267744 65.635619
rock_98.mp3 rock -518.64307 53.555115 -45.734517 52.444200
rock_99.mp3 rock -544.70310 75.612130 -49.380943 54.045627
0_skew 1_min 1_max 1_mean 1_std ... \
filename ...
classical_1.mp3 -0.468374 0.000000 178.75162 111.332342 24.847563 ...
classical_10.mp3 -0.772320 0.029056 259.63270 215.094182 18.388131 ...
classical_100.mp3 -2.587179 0.000000 190.47589 112.471713 27.277553 ...
classical_11.mp3 -2.402418 0.000000 159.42575 99.853645 21.916949 ...
classical_13.mp3 0.008325 0.000000 257.16284 211.556558 20.347034 ...
... ... ... ... ... ... ...
rock_95.mp3 -0.201055 -89.948746 201.18045 111.724191 36.463584 ...
rock_96.mp3 -0.957699 -7.415961 210.49246 125.453699 31.908869 ...
rock_97.mp3 -0.898026 -58.824410 175.20135 99.288265 25.158416 ...
rock_98.mp3 -1.705641 0.000000 187.04274 96.440874 24.137702 ...
rock_99.mp3 -0.863093 -32.930653 191.73538 93.971242 33.410220 ...
38_min 38_max 38_mean 38_std 38_skew \
filename
classical_1.mp3 -44.098070 47.308060 -3.713503 16.553984 0.230691
classical_10.mp3 -27.458416 29.811110 0.484271 8.660648 -0.479016
classical_100.mp3 -27.335688 27.610388 -0.333233 8.185075 0.208425
classical_11.mp3 -31.774948 31.500881 -3.781627 9.191043 0.260886
classical_13.mp3 -24.728806 18.424036 -0.275736 7.026148 -0.640964
... ... ... ... ... ...
rock_95.mp3 -27.043941 22.451445 -7.234634 8.471853 0.753855
rock_96.mp3 -37.584858 28.087936 -9.704238 8.447620 0.112760
rock_97.mp3 -29.620445 26.325895 -5.722825 7.727378 0.207489
rock_98.mp3 -26.967848 8.714737 -9.511491 5.551820 -0.025604
rock_99.mp3 -21.929403 17.050608 -5.296691 5.894963 0.390705
39_min 39_max 39_mean 39_std 39_skew
filename
classical_1.mp3 -46.794480 49.352516 -2.282116 15.285639 0.171462
classical_10.mp3 -28.989983 27.533710 0.952658 10.477735 -0.185771
classical_100.mp3 -38.095375 31.397880 -1.494916 10.917299 0.020985
classical_11.mp3 -22.667440 50.992897 1.600777 10.125545 0.595763
classical_13.mp3 -24.319565 18.439262 -2.147022 8.171929 0.009566
... ... ... ... ... ...
rock_95.mp3 -24.712723 23.410387 -4.502398 6.687984 0.238807
rock_96.mp3 -38.147890 21.814402 -8.249507 7.807756 0.071968
rock_97.mp3 -29.497524 25.410654 -3.356614 8.170526 0.160330
rock_98.mp3 -23.020084 13.948638 -2.664985 5.051498 -0.258407
rock_99.mp3 -20.983192 29.312023 -0.321836 6.571660 0.384794
[320 rows x 201 columns]
%% Cell type:code id:1649ce52 tags:
``` python
# Held-out partition: rows flagged train == False, with the flag column removed.
test: DataFrame = joined.loc[joined["train"] == False].drop(columns="train")
test
```
%% Output
label 0_min 0_max 0_mean 0_std \
filename
classical_12.mp3 classical -562.67523 -148.133560 -270.975406 52.191182
classical_2.mp3 classical -549.40650 -192.532060 -293.008969 27.207028
classical_20.mp3 classical -605.99150 -161.119310 -263.483084 49.157298
classical_27.mp3 classical -595.41895 -78.118810 -265.344461 104.892303
classical_39.mp3 classical -578.84720 -55.479320 -183.753039 69.140628
... ... ... ... ... ...
rock_85.mp3 rock -556.08203 44.890602 -72.618399 80.272023
rock_86.mp3 rock -534.40650 42.919650 -93.601685 62.192619
rock_88.mp3 rock -539.97880 44.375150 -126.955020 88.140999
rock_92.mp3 rock -532.89110 13.948147 -206.891688 80.812274
rock_93.mp3 rock -570.46650 -26.067888 -302.483118 96.569376
0_skew 1_min 1_max 1_mean 1_std ... \
filename ...
classical_12.mp3 -0.366586 0.000000 194.26416 148.226647 19.305008 ...
classical_2.mp3 -0.426848 0.000000 231.03738 198.662514 14.957660 ...
classical_20.mp3 -0.856221 0.000000 191.92676 141.393817 17.754779 ...
classical_27.mp3 -0.526604 0.000000 200.61633 144.208488 25.198761 ...
classical_39.mp3 -0.577055 0.000000 193.84949 127.058496 29.295691 ...
... ... ... ... ... ... ...
rock_85.mp3 -2.269420 -13.219891 205.14955 96.863927 38.352424 ...
rock_86.mp3 -0.869415 0.000000 206.32501 128.047509 30.374850 ...
rock_88.mp3 -1.700578 -19.007393 201.99960 99.760978 32.572320 ...
rock_92.mp3 0.090286 -47.724570 179.76506 109.954998 37.880477 ...
rock_93.mp3 0.159026 -89.999680 211.88910 103.686365 40.373592 ...
38_min 38_max 38_mean 38_std 38_skew \
filename
classical_12.mp3 -44.843810 28.490644 -6.242015 10.546545 0.341848
classical_2.mp3 -25.912933 24.293318 0.746096 8.240027 -0.022513
classical_20.mp3 -24.911243 38.551230 -2.274261 9.671005 0.719436
classical_27.mp3 -28.797087 20.897750 -5.761607 7.108055 0.360305
classical_39.mp3 -48.678460 24.566566 -7.810246 11.568188 -0.106704
... ... ... ... ... ...
rock_85.mp3 -22.633102 13.513550 -3.126545 5.035097 -0.035805
rock_86.mp3 -30.471783 20.564953 -3.383356 6.405211 -0.185147
rock_88.mp3 -34.726500 26.706833 -5.827121 8.260717 0.275225
rock_92.mp3 -37.614220 21.420666 -8.287362 7.851784 -0.080285
rock_93.mp3 -28.903786 35.712753 2.073339 10.995769 0.249798
39_min 39_max 39_mean 39_std 39_skew
filename
classical_12.mp3 -25.040888 46.878204 1.844494 11.160392 0.503120
classical_2.mp3 -18.561390 23.484133 3.115819 7.220346 0.242364
classical_20.mp3 -30.311798 29.272330 0.289613 9.590299 -0.244191
classical_27.mp3 -39.705540 25.803795 -2.736776 10.101577 -0.463730
classical_39.mp3 -24.328775 40.172250 -0.078006 10.646963 0.492488
... ... ... ... ... ...
rock_85.mp3 -19.814285 18.576450 -1.172361 6.078238 -0.048851
rock_86.mp3 -28.917618 26.702751 -1.950565 6.725107 -0.253487
rock_88.mp3 -31.036520 27.423218 -4.715363 6.544117 0.184718
rock_92.mp3 -41.547260 25.628895 -9.046777 8.779821 0.071449
rock_93.mp3 -30.178170 30.612560 -4.677735 8.877041 0.149639
[80 rows x 201 columns]
%% Cell type:code id:1e904bf3 tags:
%% Cell type:code id:1c01673464cb048e tags:
``` python
# remove labels
# Feature matrix without the target column; `errors='ignore'` tolerates a
# frame that already lacks 'label'.
X = train.drop(['label'], axis=1, errors='ignore')
columns: Index = X.columns
# Sorted unique class names -> ["classical", "electronic", "pop", "rock"]
classnames = np.sort(np.unique(joined.label.values))
# Bidirectional O(1) lookup tables between class name and integer index
# (dict comprehensions replace the original manual fill loop).
classname2index = {classname: i for i, classname in enumerate(classnames)}
index2classname = {i: classname for i, classname in enumerate(classnames)}
# Encode the training labels as integer class indices.
y = np.array([classname2index[classname] for classname in train.label.values])
(X, y)
```
%% Output
( 0_min 0_max 0_mean 0_std 0_skew \
filename
classical_1.mp3 -530.78436 -163.308350 -302.203167 51.142183 -0.468374
classical_10.mp3 -562.85785 -96.164795 -219.259016 53.561838 -0.772320
classical_100.mp3 -536.23737 -61.608826 -177.804114 83.381622 -2.587179
classical_11.mp3 -536.45746 -120.429665 -222.126303 76.246992 -2.402418
classical_13.mp3 -637.72064 -177.713960 -361.834032 71.310080 0.008325
... ... ... ... ... ...
rock_95.mp3 -553.11010 -5.218835 -193.506047 76.869437 -0.201055
rock_96.mp3 -541.23600 27.163334 -119.113996 58.420684 -0.957699
rock_97.mp3 -518.49500 58.526745 -66.267744 65.635619 -0.898026
rock_98.mp3 -518.64307 53.555115 -45.734517 52.444200 -1.705641
rock_99.mp3 -544.70310 75.612130 -49.380943 54.045627 -0.863093
1_min 1_max 1_mean 1_std 1_skew ... \
filename ...
classical_1.mp3 0.000000 178.75162 111.332342 24.847563 -0.402642 ...
classical_10.mp3 0.029056 259.63270 215.094182 18.388131 -1.528751 ...
classical_100.mp3 0.000000 190.47589 112.471713 27.277553 -1.318523 ...
classical_11.mp3 0.000000 159.42575 99.853645 21.916949 -1.176922 ...
classical_13.mp3 0.000000 257.16284 211.556558 20.347034 -1.050119 ...
... ... ... ... ... ... ...
rock_95.mp3 -89.948746 201.18045 111.724191 36.463584 -0.443224 ...
rock_96.mp3 -7.415961 210.49246 125.453699 31.908869 -0.547469 ...
rock_97.mp3 -58.824410 175.20135 99.288265 25.158416 -0.568057 ...
rock_98.mp3 0.000000 187.04274 96.440874 24.137702 -0.145217 ...
rock_99.mp3 -32.930653 191.73538 93.971242 33.410220 0.040113 ...
38_min 38_max 38_mean 38_std 38_skew \
filename
classical_1.mp3 -44.098070 47.308060 -3.713503 16.553984 0.230691
classical_10.mp3 -27.458416 29.811110 0.484271 8.660648 -0.479016
classical_100.mp3 -27.335688 27.610388 -0.333233 8.185075 0.208425
classical_11.mp3 -31.774948 31.500881 -3.781627 9.191043 0.260886
classical_13.mp3 -24.728806 18.424036 -0.275736 7.026148 -0.640964
... ... ... ... ... ...
rock_95.mp3 -27.043941 22.451445 -7.234634 8.471853 0.753855
rock_96.mp3 -37.584858 28.087936 -9.704238 8.447620 0.112760
rock_97.mp3 -29.620445 26.325895 -5.722825 7.727378 0.207489
rock_98.mp3 -26.967848 8.714737 -9.511491 5.551820 -0.025604
rock_99.mp3 -21.929403 17.050608 -5.296691 5.894963 0.390705
39_min 39_max 39_mean 39_std 39_skew
filename
classical_1.mp3 -46.794480 49.352516 -2.282116 15.285639 0.171462
classical_10.mp3 -28.989983 27.533710 0.952658 10.477735 -0.185771
classical_100.mp3 -38.095375 31.397880 -1.494916 10.917299 0.020985
classical_11.mp3 -22.667440 50.992897 1.600777 10.125545 0.595763
classical_13.mp3 -24.319565 18.439262 -2.147022 8.171929 0.009566
... ... ... ... ... ...
rock_95.mp3 -24.712723 23.410387 -4.502398 6.687984 0.238807
rock_96.mp3 -38.147890 21.814402 -8.249507 7.807756 0.071968
rock_97.mp3 -29.497524 25.410654 -3.356614 8.170526 0.160330
rock_98.mp3 -23.020084 13.948638 -2.664985 5.051498 -0.258407
rock_99.mp3 -20.983192 29.312023 -0.321836 6.571660 0.384794
[320 rows x 200 columns],
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]))
%% Cell type:code id:32e5e889 tags:
%% Cell type:code id:41ce60fbed0a23bc tags:
``` python
# Build the held-out feature matrix and label vector using the same
# encoding as the training data.
X_test = test.drop(['label'], axis=1, errors='ignore')
print(X.shape)
print(X_test.shape)
print(X_test.shape[0] / X.shape[0]) # fraction of test sample
# Encode test labels with the mapping fitted on the full label set above.
y_test = np.array([classname2index[classname] for classname in test.label.values])
y_test
```
%% Output
(320, 200)
(80, 200)
0.25
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
%% Cell type:code id:e165922f tags:
%% Cell type:code id:99dc29024df3d251 tags:
``` python
# Standardize for PCA
# Fit the scaler on the training features only and reuse it for the test
# features, so no test-set statistics leak into the transform.
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X.values)
X_test_standardized = scaler.transform(X_test.values)
X_standardized
```
%% Output
array([[ 0.38209988, -1.79901606, -1.34294124, ..., -0.7312519 ,
3.4358529 , 0.11530124],
[-0.42728837, -0.93236007, -0.41652953, ..., 0.22563011,
1.37555438, -0.86835549],
[ 0.24449084, -0.48632861, 0.04648451, ..., -0.49838941,
1.56391778, -0.29904453],
...,
[ 0.69222714, 1.06432227, 1.29224565, ..., -1.0491004 ,
0.38686173, 0.08464998],
[ 0.68849053, 1.00015092, 1.52158336, ..., -0.84450893,
-0.94971424, -1.06836048],
[ 0.03085452, 1.28485202, 1.48085606, ..., -0.15137928,
-0.29828957, 0.70271937]])
%% Cell type:code id:d389fd70 tags:
%% Cell type:code id:3f30e11dc4688246 tags:
``` python
# Reduce Dimensions via PCA
# NOTE(review): the two `pca =` lines are a revision-diff artifact; the
# second fit (30 components) is the effective one. The printed output below
# (0.855 explained variance, shape (320, 50)) appears to be from the older
# 50-component run — re-execute to refresh.
pca = PCA(n_components=50).fit(X_standardized)
pca = PCA(n_components=30).fit(X_standardized)
X_pca = pca.transform(X_standardized)
X_test_pca = pca.transform(X_test_standardized)
print(sum(pca.explained_variance_ratio_))
print(X_pca.shape)
print(X_test_pca.shape)
print(y.shape)
```
%% Output
0.8557392011152061
(320, 50)
(80, 50)
(320,)
%% Cell type:code id:aa1d9036 tags:
%% Cell type:code id:21bf974f979ae1f4 tags:
``` python
# Fit SVM:
# Hold out 20% of the PCA-reduced training data as a validation set
# (seeded for reproducibility) and fit a baseline RBF-kernel SVM.
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size = 0.2, random_state=4, shuffle = True)
clf = SVC(kernel='rbf', probability=True)
clf.fit(X_train, y_train)
# Baseline validation accuracy before hyper-parameter tuning.
print(accuracy_score(clf.predict(X_val), y_val))
print(X_val)
print(y_val)
```
%% Output
0.6875
[[-4.64558613 3.08838305 -1.47175688 ... -1.24828691 -0.70095473
0.01689286]
[ 5.85968202 -2.1047151 -3.35419664 ... -1.48822402 1.00205068
-0.98882563]
[ 6.52471238 -2.88386219 -5.91379963 ... 0.08618421 0.03366275
-0.55189302]
...
[ 5.3496866 3.90245458 -4.07128854 ... -0.82356091 -0.7968544
0.26045289]
[ 6.68981697 -1.18340439 -0.12267599 ... 1.33593613 -2.8015435
0.5028293 ]
[-4.78063681 -7.16377441 4.09506551 ... -1.0308011 0.83671387
-0.07027211]]
[3 0 3 2 3 0 1 2 0 3 0 0 0 1 2 1 2 3 1 1 1 0 3 0 0 0 3 1 1 3 3 2 3 1 2 1 0
1 0 1 3 0 0 0 0 3 3 3 0 3 3 3 1 2 2 0 1 2 1 2 3 2 1 0]
%% Cell type:code id:fc48c86e tags:
%% Cell type:code id:6099c8ae2b4be921 tags:
``` python
# grid for C, gamma
# NOTE(review): the duplicated `C_grid =` and `grid.fit(...)` lines are a
# revision-diff artifact; the second occurrence of each wins, so the search
# runs over the extended C grid and fits on the full training set (X_pca, y).
C_grid = [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
C_grid = [0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
gamma_grid = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'C': C_grid, 'gamma': gamma_grid}
# 5-fold cross-validated search over (C, gamma) for the RBF SVM.
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)
grid.fit(X_pca, y)
# Find the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
# NOTE(review): X_val is part of X_pca after the second fit, so this
# validation accuracy is optimistic — confirm whether that is intended.
print(accuracy_score(grid.predict(X_val), y_val))
```
%% Output
0.7343891402714932
{'C': 3, 'gamma': 0.01}
SVC(C=3, gamma=0.01)
0.78125
%% Cell type:code id:7cf87469 tags:
%% Cell type:code id:43a8791efe8809f4 tags:
``` python
# Optimal model
# Fit entire training sets with optimal model
# NOTE(review): C=4 does not match the grid-search best above ({'C': 3,
# 'gamma': 0.01}) — confirm whether this is deliberate. Also, despite the
# comment, this cell fits only X_train; the full-set fit happens below.
clf = SVC(kernel='rbf', C=4, gamma=0.01, probability=True)
clf.fit(X_train, y_train)
print(accuracy_score(clf.predict(X_val), y_val))
```
%% Output
0.78125
%% Cell type:code id:5a754cd1 tags:
``` python
# Fit entire training sets
# Refit the tuned classifier on all PCA-reduced training data, then score
# the held-out test partition.
clf.fit(X_pca, y)
# Per-class probabilities for the test set, reused later for the
# probability table.
proba = clf.predict_proba(X_test_pca)
# NOTE(review): the plain print lines and the f-string line are old/new
# revision-diff duplicates of the same accuracy report.
print(accuracy_score(clf.predict(X_test_pca), y_test))
print(clf.predict_proba(X_test_pca))
print(f"Accuracy score: {accuracy_score(clf.predict(X_test_pca), y_test)}")
```
%% Output
0.8
[[9.60125451e-01 2.54410379e-02 1.00183548e-02 4.41515609e-03]
[9.93544791e-01 4.04634019e-03 1.20649558e-03 1.20237342e-03]
[9.97430192e-01 1.76800719e-04 5.38565546e-04 1.85444214e-03]
[9.79967977e-01 6.86113735e-03 9.68497114e-03 3.48591496e-03]
[9.91884967e-01 5.17290348e-03 1.26266158e-03 1.67946793e-03]
[9.85578464e-01 9.44992493e-03 3.75086068e-03 1.22075036e-03]
[2.04862989e-01 4.53621014e-01 1.34373358e-01 2.07142639e-01]
[9.99181855e-01 4.86945868e-04 2.22608725e-04 1.08590413e-04]
[9.92658119e-01 3.47218548e-03 2.74696376e-03 1.12273207e-03]
[9.99656357e-01 1.12727916e-04 1.43400994e-04 8.75138776e-05]
[8.47319131e-01 4.69014094e-02 7.09411516e-02 3.48383077e-02]
[1.28380278e-01 3.67332428e-01 3.59429595e-01 1.44857699e-01]
[9.96413445e-01 2.75890076e-03 4.65504357e-04 3.62150045e-04]
[9.98826125e-01 7.62447290e-04 3.01490088e-04 1.09937383e-04]
[9.99401836e-01 8.67850526e-05 3.74373911e-04 1.37005308e-04]
[9.97955498e-01 1.69931669e-03 1.73626292e-04 1.71558652e-04]
[8.45643860e-01 1.33426916e-02 9.97412359e-02 4.12722121e-02]
[9.82092462e-01 1.15346135e-02 3.19973757e-03 3.17318740e-03]
[9.83213850e-01 1.24420959e-02 3.26304918e-03 1.08100527e-03]
[9.99642856e-01 7.19184901e-05 1.55316717e-04 1.29908898e-04]
[9.97979494e-01 1.76870557e-03 1.31807873e-04 1.19992584e-04]
[4.92333515e-04 9.38096306e-01 2.10469538e-02 4.03644064e-02]
[9.45551189e-03 4.32699483e-01 4.16341606e-01 1.41503399e-01]
[9.13893710e-03 4.44229440e-01 3.15860710e-01 2.30770912e-01]
[6.79828415e-02 6.71681498e-01 2.09457159e-01 5.08785014e-02]
[1.68076034e-04 9.71769830e-01 2.24441690e-03 2.58176775e-02]
[5.73737808e-02 8.61494512e-02 5.86365884e-01 2.70110884e-01]
[1.18603200e-01 5.68582627e-01 2.33418558e-01 7.93956149e-02]
[1.11117289e-02 9.36048570e-01 2.07419839e-02 3.20977167e-02]
[4.27128683e-03 2.53015466e-01 4.52073691e-01 2.90639556e-01]
[8.49595708e-03 6.37021927e-01 1.52099758e-01 2.02382358e-01]
[9.29855946e-04 8.43628458e-01 1.67412440e-02 1.38700442e-01]
[5.75440080e-02 6.65893968e-01 1.18869183e-01 1.57692841e-01]
[7.28891949e-02 6.97755501e-01 1.23916666e-01 1.05438637e-01]
[1.00364172e-01 3.05951082e-01 4.02534596e-01 1.91150150e-01]
[2.71956862e-04 5.43067021e-01 1.43066793e-02 4.42354343e-01]
[8.60586155e-02 8.06134589e-02 6.12157762e-01 2.21170163e-01]
[4.54205646e-02 3.77922605e-02 7.46222645e-01 1.70564530e-01]
[2.60732219e-02 1.78887893e-01 3.03253706e-01 4.91785179e-01]
[1.76685545e-01 1.49702306e-01 5.30947449e-01 1.42664700e-01]
[2.10423538e-02 3.16261307e-02 6.86655601e-01 2.60675914e-01]
[5.10365555e-03 9.06077798e-03 3.10609892e-01 6.75225674e-01]
[1.85590659e-04 4.20187052e-01 2.54067881e-01 3.25559476e-01]
[1.84121015e-03 1.49368051e-03 5.94696830e-01 4.01968279e-01]
[9.94756099e-03 1.98337895e-02 6.10189918e-01 3.60028732e-01]
[1.06218859e-02 5.83443846e-02 4.09385718e-01 5.21648011e-01]
[2.51610276e-01 1.06475171e-01 4.02323327e-01 2.39591226e-01]
[1.05739190e-03 4.80039248e-03 7.84298209e-01 2.09844007e-01]
[1.20304373e-03 2.49929289e-03 4.25498367e-01 5.70799297e-01]
[5.17165422e-04 2.44187897e-03 7.70942808e-01 2.26098148e-01]
[1.48279902e-01 4.34212254e-01 3.33486768e-01 8.40210765e-02]
[6.49493657e-03 2.03203941e-03 6.76591245e-01 3.14881779e-01]
[1.42643647e-03 3.00507802e-02 7.66466942e-01 2.02055842e-01]
[2.71205953e-04 1.64674206e-03 5.18908081e-01 4.79173971e-01]
[6.18460044e-04 8.65733199e-03 7.31160871e-01 2.59563337e-01]
[5.99851686e-04 9.88068783e-03 3.18075020e-01 6.71444441e-01]
[8.92857719e-05 2.49912334e-03 8.22928402e-01 1.74483188e-01]
[4.08821963e-03 4.01685411e-03 2.22308630e-01 7.69586296e-01]
[3.85280110e-04 4.28844983e-03 4.38873417e-01 5.56452853e-01]
[7.77946831e-04 9.39309422e-03 1.89573855e-01 8.00255104e-01]
[1.07826925e-03 4.48667610e-03 1.68966113e-01 8.25468942e-01]
[4.32984844e-03 3.71263242e-02 1.74061879e-01 7.84481948e-01]
[8.91964233e-04 4.60229508e-03 2.56203571e-01 7.38302169e-01]
[1.53170345e-04 2.66905629e-03 8.05893086e-01 1.91284687e-01]
[3.76678169e-04 2.66687172e-02 1.35691366e-01 8.37263238e-01]
[1.87189571e-03 2.95477730e-02 1.83614398e-01 7.84965933e-01]
[3.65699757e-04 4.65723230e-02 1.96467002e-01 7.56594975e-01]
[3.91020418e-03 2.21215837e-02 3.46096170e-01 6.27872042e-01]
[3.53128321e-04 1.26062549e-03 4.04030924e-01 5.94355323e-01]
[3.85531972e-04 1.67060179e-03 5.14520249e-01 4.83423617e-01]
[4.01176053e-04 1.39364758e-03 5.62411421e-01 4.35793755e-01]
[2.19890976e-02 4.13933530e-01 3.17505597e-01 2.46571775e-01]
[2.63540892e-03 1.60423321e-02 1.69895446e-01 8.11426813e-01]
[5.95478507e-04 7.12069104e-04 9.01272706e-02 9.08565182e-01]
[2.56904495e-04 3.92709426e-03 3.41668674e-01 6.54147328e-01]
[3.34122792e-04 5.02991556e-03 3.01652248e-01 6.92983714e-01]
[1.74105457e-03 1.54657507e-02 2.27888902e-01 7.54904293e-01]
[3.34518377e-02 5.51052761e-02 3.32962366e-01 5.78480520e-01]
[1.16808056e-03 1.31231889e-03 1.63219289e-01 8.34300311e-01]
[8.88813523e-02 1.55465620e-01 3.86988580e-01 3.68664447e-01]]
%% Cell type:code id:bbd99cb8 tags:
``` python
# svc_path = BASE_PATH / "out" / "SVC"/ "clf.pickle"
# svc_path.parent.mkdir(parents=True, exist_ok=True)
#
# with open(svc_path, "wb") as file:
# pickle.dump(clf, file)
#
# with open(svc_path, "rb") as file:
# loaded = pickle.load(file)
# loaded.predict_proba(X_test_pca)
```
%% Cell type:code id:af3c36d2 tags:
%% Cell type:code id:28c779539faeb27c tags:
``` python
# Fit the entire training sets
def convert_to_labels(preds, i2c, k=3):
    """Map class-probability rows to their top-k class names and indices.

    Args:
        preds: iterable of per-sample class-probability arrays
            (e.g. the output of ``clf.predict_proba``).
        i2c: mapping from class index to class name.
        k: number of top-ranked predictions to keep per sample.

    Returns:
        (labels, indices): for each sample, a list of the k class names
        and a list of the k class indices, both ordered by descending
        probability.
    """
    ans = []
    ids = []
    for p in preds:
        # argsort is ascending; reverse and slice once for the top-k
        # indices by descending probability (original sliced twice and
        # copied via a no-op comprehension).
        top = np.argsort(p)[::-1][:k]
        ids.append(list(top))
        ans.append([i2c[i] for i in top])
    return ans, ids
# Refit the classifier on the full (PCA-transformed) training set, then
# rank all four genre predictions for every test sample.
clf.fit(X_pca, y)
# NOTE(review): despite its name, `percentage_lists` holds the class
# *indices* (second return value of convert_to_labels), not percentages.
prediction_lists, percentage_lists = convert_to_labels(clf.predict_proba(X_test_pca), index2classname, k=4)
genres = ["classical", "electronic", "pop", "rock"]
# # Write to outputs
# One row per test file: true label plus the ranked predictions 1..4.
subm = pd.DataFrame(index=test.index)
subm['label'] = test.label.values
subm['pred1'] = [prediction_list[0] for prediction_list in prediction_lists]
subm['pred2'] = [prediction_list[1] for prediction_list in prediction_lists]
subm['pred3'] = [prediction_list[2] for prediction_list in prediction_lists]
subm['pred4'] = [prediction_list[3] for prediction_list in prediction_lists]
# Per-genre prediction probabilities, one column per genre.
# NOTE(review): `proba` is not assigned in this cell — presumably the
# predict_proba output of an earlier cell; confirm its row order matches
# test.index.
proba_df = pd.DataFrame(index=test.index)
proba_df['label'] = test.label.values
proba_df[genres[0]] = proba[:,0:1]
proba_df[genres[1]] = proba[:,1:2]
proba_df[genres[2]] = proba[:,2:3]
proba_df[genres[3]] = proba[:,3:4]
# Temporarily lift the row limit so the full tables render in the notebook,
# then restore the default afterwards.
pd.set_option('display.max_rows', None)
print(subm)
# print(subm)
display(subm)
display(proba_df)
pd.reset_option('display.max_rows')
```
%% Output
%% Cell type:code id:a816521f533c6539 tags:
``` python
# Confusion matrix of the top-1 prediction: rows = actual genre,
# columns = predicted genre, rendered as an annotated heatmap.
conf_matrix = pd.DataFrame(confusion_matrix(subm['label'], subm['pred1']), columns=genres, index=genres)
plt.figure(dpi=200)
display(sns.heatmap(conf_matrix, annot=True).set( xlabel="Prediction", ylabel="Actual"))
```
%% Cell type:code id:d2d7e5ef892ec807 tags:
``` python
# Top-2 accuracy: a sample counts as correct if the true label appears in
# either of the first two predictions.
subm_top_2 = subm.copy()
# Credit pred2 when it matches the label, otherwise fall back to pred1.
subm_top_2["top_2"] = subm.apply(lambda row: row.get("pred2") if row.get("label") == row.get("pred2") else row.get("pred1"), axis=1)
conf_matrix_top_2 = pd.DataFrame(confusion_matrix(subm['label'], subm_top_2["top_2"]), columns=genres, index=genres)
# Accuracy = trace of the confusion matrix / total sample count.
accuracy_score_top_2 = sum(sum(conf_matrix_top_2.values * np.identity(4))) / sum(sum(conf_matrix_top_2.values))
print(f"Accuracy for top 2 predictions: {accuracy_score_top_2}")
display(sns.heatmap(conf_matrix_top_2, annot=True).set( xlabel="Prediction", ylabel="Actual"))
```
%% Cell type:code id:4433589d09bda6e5 tags:
``` python
display(sns.heatmap(proba_df.corr(numeric_only=True), vmin=-1, vmax=1, annot=True).set(title="Correlation heatmap of prediction probabilities"))
```
%% Cell type:markdown id:209e3007ae290ede tags:
label pred1 pred2 pred3 pred4
filename
classical_12.mp3 classical classical electronic pop rock
classical_2.mp3 classical classical electronic pop rock
classical_20.mp3 classical classical rock pop electronic
classical_27.mp3 classical classical pop electronic rock
classical_39.mp3 classical classical electronic rock pop
classical_4.mp3 classical classical electronic pop rock
classical_40.mp3 classical electronic rock classical pop
classical_46.mp3 classical classical electronic pop rock
classical_47.mp3 classical classical electronic pop rock
classical_48.mp3 classical classical pop electronic rock
classical_49.mp3 classical classical pop electronic rock
classical_52.mp3 classical electronic pop rock classical
classical_54.mp3 classical classical electronic pop rock
classical_6.mp3 classical classical electronic pop rock
classical_62.mp3 classical classical pop rock electronic
classical_67.mp3 classical classical electronic pop rock
classical_69.mp3 classical classical pop rock electronic
classical_82.mp3 classical classical electronic pop rock
classical_9.mp3 classical classical electronic pop rock
classical_92.mp3 classical classical pop rock electronic
classical_94.mp3 classical classical electronic pop rock
electronic_11.mp3 electronic electronic rock pop classical
electronic_20.mp3 electronic electronic pop rock classical
electronic_21.mp3 electronic electronic pop rock classical
electronic_3.mp3 electronic electronic pop classical rock
electronic_35.mp3 electronic electronic rock pop classical
electronic_36.mp3 electronic pop rock electronic classical
electronic_38.mp3 electronic electronic pop classical rock
electronic_44.mp3 electronic electronic rock pop classical
electronic_49.mp3 electronic pop rock electronic classical
electronic_55.mp3 electronic electronic rock pop classical
electronic_59.mp3 electronic electronic rock pop classical
electronic_61.mp3 electronic electronic rock pop classical
electronic_62.mp3 electronic electronic pop rock classical
electronic_63.mp3 electronic pop electronic rock classical
electronic_81.mp3 electronic electronic rock pop classical
pop_1.mp3 pop pop rock electronic classical
pop_10.mp3 pop pop rock classical electronic
pop_100.mp3 pop rock pop electronic classical
pop_25.mp3 pop pop classical electronic rock
pop_32.mp3 pop pop rock electronic classical
pop_38.mp3 pop rock pop electronic classical
pop_39.mp3 pop electronic rock pop classical
pop_50.mp3 pop pop rock classical electronic
pop_53.mp3 pop pop rock electronic classical
pop_58.mp3 pop rock pop electronic classical
pop_61.mp3 pop pop rock classical electronic
pop_62.mp3 pop pop rock electronic classical
pop_64.mp3 pop rock pop electronic classical
pop_65.mp3 pop pop rock electronic classical
pop_70.mp3 pop electronic pop classical rock
pop_79.mp3 pop pop rock classical electronic
pop_80.mp3 pop pop rock electronic classical
pop_82.mp3 pop pop rock electronic classical
pop_85.mp3 pop pop rock electronic classical
pop_91.mp3 pop rock pop electronic classical
pop_98.mp3 pop pop rock electronic classical
rock_18.mp3 rock rock pop electronic classical
rock_2.mp3 rock rock pop electronic classical
rock_23.mp3 rock rock pop electronic classical
rock_32.mp3 rock rock pop electronic classical
rock_45.mp3 rock rock pop electronic classical
rock_46.mp3 rock rock pop electronic classical
rock_48.mp3 rock pop rock electronic classical
rock_51.mp3 rock rock pop electronic classical
rock_52.mp3 rock rock pop electronic classical
rock_57.mp3 rock rock pop electronic classical
rock_6.mp3 rock rock pop electronic classical
rock_62.mp3 rock rock pop electronic classical
rock_63.mp3 rock pop rock electronic classical
rock_66.mp3 rock pop rock electronic classical
rock_73.mp3 rock electronic pop rock classical
rock_75.mp3 rock rock pop electronic classical
rock_78.mp3 rock rock pop electronic classical
rock_80.mp3 rock rock pop electronic classical
rock_85.mp3 rock rock pop electronic classical
rock_86.mp3 rock rock pop electronic classical
rock_88.mp3 rock rock pop electronic classical
rock_92.mp3 rock rock pop electronic classical
rock_93.mp3 rock pop rock electronic classical
### Interpretation of results:
The confusion matrix shows the true labels on the y-axis, the predicted values on the x-axis.
Classical music was predicted well, with only 1 sample misclassified as electronic.
Pop has the most misclassifications, with a true positive rate of 44.44%, due to wrong classifications as electronic (4) and rock (6).
A high correlation between rock and pop can also be seen in the correlation plot between prediction probabilities.
%% Cell type:code id:bbd99cb8 tags:
``` python
# test pickle saving & loading
# svc_path = BASE_PATH / "out" / "SVC"/ "clf.pickle"
# svc_path.parent.mkdir(parents=True, exist_ok=True)
#
# with open(svc_path, "wb") as file:
# pickle.dump(clf, file)
#
# with open(svc_path, "rb") as file:
# loaded = pickle.load(file)
# loaded.predict_proba(X_test_pca)
```
%% Cell type:code id:4a32007a tags:
``` python
# output
# Persist the fitted classifier (pickle) and the prediction table (CSV)
# to the configured output paths, creating parent directories first.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
Path(OUTPUT_PATHS["clf"]).resolve().parent.mkdir(parents=True, exist_ok=True)
Path(OUTPUT_PATHS["submission"]).resolve().parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATHS["clf"], "wb") as file:
    pickle.dump(clf, file)
subm.to_csv(OUTPUT_PATHS["submission"], index=False)
```
%% Cell type:code id:99782035 tags:
``` python
# def get_result() -> pd.DataFrame:
# """ Return the produced artefact of this notebook """
# return result
```
......
source diff could not be displayed: it is too large. Options to address this: view the blob.
source diff could not be displayed: it is too large. Options to address this: view the blob.
......@@ -123,13 +123,13 @@ files = [
[[package]]
name = "anyio"
version = "4.2.0"
version = "4.3.0"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
optional = false
python-versions = ">=3.8"
files = [
{file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"},
{file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"},
{file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
{file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
]
[package.dependencies]
......@@ -2149,13 +2149,13 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=
[[package]]
name = "nbconvert"
version = "7.16.0"
description = "Converting Jupyter Notebooks"
version = "7.16.1"
description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
optional = false
python-versions = ">=3.8"
files = [
{file = "nbconvert-7.16.0-py3-none-any.whl", hash = "sha256:ad3dc865ea6e2768d31b7eb6c7ab3be014927216a5ece3ef276748dd809054c7"},
{file = "nbconvert-7.16.0.tar.gz", hash = "sha256:813e6553796362489ae572e39ba1bff978536192fb518e10826b0e8cadf03ec8"},
{file = "nbconvert-7.16.1-py3-none-any.whl", hash = "sha256:3188727dffadfdc9c6a1c7250729063d7bc78b355ad7aa023138afa030d1cd07"},
{file = "nbconvert-7.16.1.tar.gz", hash = "sha256:e79e6a074f49ba3ed29428ed86487bf51509d9aab613bd8522ac08f6d28fd7fd"},
]
[package.dependencies]
......@@ -2917,19 +2917,23 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pydantic-settings"
version = "2.1.0"
version = "2.2.0"
description = "Settings management using Pydantic"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydantic_settings-2.1.0-py3-none-any.whl", hash = "sha256:7621c0cb5d90d1140d2f0ef557bdf03573aac7035948109adf2574770b77605a"},
{file = "pydantic_settings-2.1.0.tar.gz", hash = "sha256:26b1492e0a24755626ac5e6d715e9077ab7ad4fb5f19a8b7ed7011d52f36141c"},
{file = "pydantic_settings-2.2.0-py3-none-any.whl", hash = "sha256:5f7bcaf9ad4419559dc5ac155c0324a9aeb2547c60471ee7c7d026f467a6b515"},
{file = "pydantic_settings-2.2.0.tar.gz", hash = "sha256:648d0a76673e69c51278979cba2e83cf16a23d57519bfd7e553d1c3f37db5560"},
]
[package.dependencies]
pydantic = ">=2.3.0"
python-dotenv = ">=0.21.0"
[package.extras]
toml = ["tomlkit (>=0.12)"]
yaml = ["pyyaml (>=6.0.1)"]
[[package]]
name = "pygments"
version = "2.17.2"
......@@ -3961,13 +3965,13 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake
[[package]]
name = "urllib3"
version = "2.2.0"
version = "2.2.1"
description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false
python-versions = ">=3.8"
files = [
{file = "urllib3-2.2.0-py3-none-any.whl", hash = "sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224"},
{file = "urllib3-2.2.0.tar.gz", hash = "sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20"},
{file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
{file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
]
[package.extras]
......
access:
files: public
files: restricted
record: public
files:
default_preview: null
......@@ -24,13 +24,13 @@ metadata:
publisher: TU Wien
related_identifiers:
- identifier: https://www2.projects.science.uu.nl/memotion/emotifydata/
relation:
relation_type:
id: isderivedfrom
resource_type:
id: sound
scheme: url
- identifier: https://gitlab.tuwien.ac.at/martin.weise/fairnb
relation:
relation_type:
id: isderivedfrom
resource_type:
id: software
......