*** Wartungsfenster jeden ersten Mittwoch vormittag im Monat ***

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • martin.weise/fairnb
1 result
Show changes
Commits on Source (6)
Showing
with 324 additions and 2181 deletions
host: https://dbrepo1.ec.tuwien.ac.at
database-id: <insert database id>
database-id: 23
credentials:
username: <insert username from dbrepo>
password: <insert password from dbrepo>
......
host: https://researchdata.tuwien.ac.at
credentials:
token: <insert token from invenio>
\ No newline at end of file
token: <insert token of InvenioRDM API>
\ No newline at end of file
......@@ -12,7 +12,7 @@ from keycloak import KeycloakOpenID
LOG = logging.getLogger(__name__)
TIMEOUT = 600
CHUNK_SIZE = 1024 * 1024 * 100
CHUNK_SIZE = 1024 * 1024 * 20
def re_auth(func: Callable) -> Callable:
@wraps(func)
......@@ -210,10 +210,10 @@ class DBRepoConnector:
return None
@re_auth
def create_table(self, dataframe: pd.DataFrame, table_name: str, table_descriptor: str):
def create_table(self, dataframe: pd.DataFrame, table_name: str, table_description: str):
""" Creates a new table """
data = self._create_table_data(dataframe, table_name, table_descriptor)
data = self._create_table_data(dataframe, table_name, table_description)
response = requests.post(
f"{self.host}/api/database/{self.database_id}/table",
......@@ -234,10 +234,10 @@ class DBRepoConnector:
def create_table_if_not_exists(self,
dataframe: pd.DataFrame,
table_name: str,
table_descriptor: str
table_description: str
):
table = table if (table := self.get_table(table_name)) is not None else \
self.create_table(dataframe, table_name, table_descriptor)
self.create_table(dataframe, table_name, table_description)
return table
......@@ -269,6 +269,7 @@ class DBRepoConnector:
"quote": '"',
"separator": ",",
"skip_lines": 1,
"line_termination": "\n",
"true_element": "True"
},
headers=self.headers
......
......@@ -8,7 +8,7 @@ import requests as rq
log = logging.getLogger(__name__)
class InvenioConnector:
class InvenioRDMConnector:
def __init__(self, token: str, host: str | None = None):
self.host = host or "https://test.researchdata.tuwien.ac.at"
self.token = token
......@@ -130,14 +130,14 @@ class InvenioConnector:
executor.map(lambda p: self.download_file(*p), args)
class InvenioManager:
class InvenioRDMManager:
"""A high level interface to up- and download files from invenio.
Utilizes state management:
1. record not assigned
2. record assigned
3. record published"""
def __init__(self, invenio_connector: InvenioConnector, record_id: str | None = None):
def __init__(self, invenio_connector: InvenioRDMConnector, record_id: str | None = None):
self.connector = invenio_connector
self.record_id = record_id
self.doi = None
......
......@@ -9,7 +9,7 @@ from fairnb.entity.entity_provenance import EntityProvenance
@dataclass
class DbRepoEntity(Entity):
class DBRepoEntity(Entity):
table_name: str = field(init=True, default=None)
table_description: str = field(init=True, default="")
table_id: int = field(init=False, default=None)
......@@ -19,7 +19,7 @@ class DbRepoEntity(Entity):
super().__post_init__()
if self.metadata is not None: # equivalent to: self.id is not None
self.table_id = int(self.metadata.uri.split("/")[-1])
self.table_id = int(self.metadata.pi.split("/")[-1])
else:
assert self.table_name is not None # has to exist fot the ability to get table_id
......@@ -60,9 +60,10 @@ class DbRepoEntity(Entity):
self.location.resolve().parent.mkdir(parents=True, exist_ok=True)
df.to_csv(self.location, index=False)
def upload(self, executed_file: Path, dependencies: list[Entity] = None,
def upload(self, executed_file: Path, main_file: Path,
dependencies: list[Entity] = None,
start_time: datetime = datetime.now(),
end_time: datetime = datetime.now()):
end_time: datetime = datetime.now()) -> EntityProvenance:
df = pd.read_csv(self.location)
# add id column to df:
......@@ -80,10 +81,11 @@ class DbRepoEntity(Entity):
self.name,
self.description,
executed_file=executed_file,
uri=f"{self.dbrepo_connector.host}/database/"
main_file=main_file,
pi=f"{self.dbrepo_connector.host}/database/"
f"{self.dbrepo_connector.database_id}/table/{self.table_id}",
type=self.type,
platform=self.repository,
repository=self.repository,
started_at=start_time,
ended_at=end_time
)
......@@ -98,6 +100,8 @@ class DbRepoEntity(Entity):
self.upload_dependencies(dependencies)
return self.metadata
def upload_data(self, df: pd.DataFrame):
assert self.id is not None
assert self.table_id is not None
......
......@@ -11,12 +11,13 @@ from fairnb.api.dbrepo import DBRepoConnector
from fairnb.entity.entity_provenance import EntityProvenance
PROVENANCE_TABLE_NAME = "entity_provenance_test3"
DEPENDENCY_TABLE_NAME = "entity_dependencies_test3"
PROVENANCE_TABLE_NAME = "entity_provenance"
DEPENDENCY_TABLE_NAME = "entity_dependencies"
LOG = logging.getLogger(__name__)
# TODO: Upload Datetime objects as Timestamps instead of str
@dataclass
class Entity(ABC):
""" A O-Prov Entity class used to represent an Entity created by a notebook.
......@@ -65,12 +66,12 @@ class Entity(ABC):
self.download_provenance()
@abstractmethod
def download(self) -> EntityProvenance:
def download(self):
"""Download this Entity and return the attached EntityProvenance"""
raise NotImplementedError
@abstractmethod
def upload(self, executed_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
def upload(self, executed_file: Path, main_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
"""Upload this Entity"""
raise NotImplementedError
......@@ -131,6 +132,8 @@ class Entity(ABC):
self.id = meta.id
self.metadata = meta
LOG.info(f"Uploaded provenance information for {self.name} with id {self.id}: {self.metadata}")
def upload_dependencies(self, dependencies):
""" Upload the dependency information for this Entity.
It lists all entities, which have an id, this entity depends on.
......
......@@ -15,7 +15,7 @@ class EntityProvenance:
"""
id: str | None # id of entity, always unique
uri: str # unique resource identifier used to locate entity (can also be used to point to table containing entity)
pi: str # persistent identifier used to locate entity (can also be used to point to table containing entity)
name: str # name of specific entity describing the data it contains
description: str # more detailed description of the enitity
type: str # type of entity, if notebook is run with different data type stays the same
......@@ -23,9 +23,10 @@ class EntityProvenance:
branch: str # the branch of the repository, makes manual search of commit easier
repo_uri: str # the uri of the repository, used to locate the repository
executed_file: str # path to notebook which was executed to create the entity
started_at: datetime # start time of execution where entity was created
ended_at: datetime # end time of execution where entity was created
platform: str # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
main_file: str # path to the main file executing the notebook
started_at: datetime # start time of execution where entity was created
ended_at: datetime # end time of execution where entity was created
repository: str # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
@classmethod
def new(
......@@ -33,9 +34,10 @@ class EntityProvenance:
name: str,
description: str,
executed_file: Path,
main_file: Path,
type: str,
uri: str,
platform: str,
pi: str,
repository: str,
started_at: datetime,
ended_at: datetime
):
......@@ -50,20 +52,22 @@ class EntityProvenance:
repo_uri = re.sub(":\d+/", "/", f"https://{repo_uri.split('@', 1)[1]}")
executed_file_rel = executed_file.resolve().relative_to(BASE_PATH)
main_file_rel = main_file.resolve().relative_to(BASE_PATH)
return cls(
id=None,
name=name,
description=description,
uri=uri,
pi=pi,
commit=commit,
repo_uri=repo_uri,
started_at=started_at,
ended_at=ended_at,
branch=branch,
executed_file=executed_file_rel.as_posix(),
main_file=main_file_rel.as_posix(),
type=type,
platform=platform,
repository=repository,
)
@classmethod
......@@ -72,10 +76,11 @@ class EntityProvenance:
id=df["id"],
name=df["name"],
description=df["description"],
uri=df["uri"],
pi=df["pi"],
commit=df["commit"],
repo_uri=df["git_uri"],
executed_file=df["executed_file"],
main_file=df["main_file"],
started_at=datetime.strptime(
df["started_at"], "%Y-%m-%d %H:%M:%S.%f"
), # TODO: replace with '%F %T'
......@@ -84,7 +89,7 @@ class EntityProvenance:
),
branch=df["branch"],
type=df["type"],
platform=df["repository"],
repository=df["repository"],
)
def to_frame(self):
......@@ -93,14 +98,15 @@ class EntityProvenance:
"id": pd.Series(self.id, dtype=str),
"name": pd.Series(self.name, dtype=str),
"description": pd.Series(self.description, dtype=str),
"uri": pd.Series(self.uri, dtype=str),
"pi": pd.Series(self.pi, dtype=str),
"commit": pd.Series(self.commit, dtype=str),
"git_uri": pd.Series(self.repo_uri, dtype=str),
"executed_file": pd.Series(self.executed_file, dtype=str),
"main_file": pd.Series(self.main_file, dtype=str),
"started_at": pd.Series(self.started_at, dtype=str),
"ended_at": pd.Series(self.ended_at, dtype=str),
"branch": pd.Series(self.branch, dtype=str),
"type": pd.Series(self.type, dtype=str),
"repository": pd.Series(self.platform, dtype=str),
"repository": pd.Series(self.repository, dtype=str),
}
)
......@@ -3,14 +3,14 @@ from datetime import datetime
from pathlib import Path
from fairnb.api.dbrepo import DBRepoConnector
from fairnb.api.invenio import InvenioManager, InvenioConnector
from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
from fairnb.entity.entity import Entity
from fairnb.entity.entity_provenance import EntityProvenance
@dataclass
class InvenioEntity(Entity):
invenio_manager: InvenioManager = field(init=True, default=None)
class InvenioRDMEntity(Entity):
invenio_manager: InvenioRDMManager = field(init=True, default=None)
record_metadata: dict = field(init=True, default=None)
publish_record: bool = field(init=True, default=False)
platform: str = field(init=False, default="https://doi.org/10.17616/R31NJMYD")
......@@ -24,11 +24,11 @@ class InvenioEntity(Entity):
description: str,
type: str,
dbrepo_connector: DBRepoConnector,
invenio_connector: InvenioConnector,
invenio_connector: InvenioRDMConnector,
publish_record: bool = False,
):
return cls(
invenio_manager=InvenioManager(invenio_connector),
invenio_manager=InvenioRDMManager(invenio_connector),
record_metadata=record_metadata,
dbrepo_connector=dbrepo_connector,
location=location,
......@@ -44,13 +44,13 @@ class InvenioEntity(Entity):
id: str,
location: Path,
dbrepo_connector: DBRepoConnector,
invenio_connector: InvenioConnector,
invenio_connector: InvenioRDMConnector,
):
return cls(
id=id,
location=location,
dbrepo_connector=dbrepo_connector,
invenio_manager=InvenioManager(invenio_connector)
invenio_manager=InvenioRDMManager(invenio_connector)
)
def __post_init__(self):
......@@ -60,9 +60,10 @@ class InvenioEntity(Entity):
assert self.record_metadata is not None
return
self.invenio_manager.record_id = self.metadata.uri.split('/')[-1]
self.invenio_manager.record_id = self.metadata.pi.split('/')[-1]
def upload(self, executed_file: Path, dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
def upload(self, executed_file: Path, main_file: Path,
dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
dir_path: Path
regex: str
......@@ -91,9 +92,10 @@ class InvenioEntity(Entity):
name=self.name,
description=self.description,
executed_file=executed_file,
uri=uri.replace('/api', ''),
main_file=main_file,
pi=uri.replace('/api', ''),
type=self.type,
platform=self.platform,
repository=self.platform,
started_at=started_at,
ended_at=ended_at,
)
......
......@@ -73,6 +73,7 @@ class Executor:
# use inspect to get path of caller
entity.upload(
nb_config.nb_location,
nb_config.main_location,
nb_config.dependencies,
nb_config.started_at,
nb_config.ended_at
......
......@@ -8,6 +8,7 @@ from fairnb.entity.entity import Entity
@dataclass
class NbConfig:
nb_location: Path
main_location: Path
entities: list[Entity]
dependencies: list[Entity]
nb_output_location: Path = field(init=True, default=None)
......
......@@ -5,7 +5,7 @@ import pandas as pd
import tarfile
from fairnb.api.dbrepo import DBRepoConnector
from fairnb.api.invenio import InvenioManager, InvenioConnector
from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
from definitions import CONFIG_PATH
import yaml
......@@ -46,14 +46,14 @@ class Util:
def get_invenio_connector(self, path: pathlib.Path = None):
config = self.get_config(path=path)
return InvenioConnector(
return InvenioRDMConnector(
token=config["credentials"]["token"],
host=config["host"]
)
def get_invenio_manager(self, path: pathlib.Path = None):
config = self.get_config(path=path)
return InvenioManager(
return InvenioRDMManager(
self.get_invenio_connector(path=path)
)
......
%% Cell type:markdown id:4389a8092677254e tags:
# Audio Files
Bundle the provided audio files (400, in MP3) in a tar, encrypt it using gzip and store it in the output folder.
%% Cell type:code id:87ab37c6 tags:
``` python
from definitions import BASE_PATH
import tarfile
import zipfile
import os
from pathlib import Path
```
%% Cell type:code id:1b4e6b01 tags:parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": str(BASE_PATH / "tmp/1_audio_files/output/emotifymusic.tar.gz")
}
```
%% Cell type:code id:15dea136 tags:injected-parameters
%% Cell type:code id:1a6df3b0 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/1_audio_files/output/emotifymusic.tar.gz"
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/1_audio_files/output/emotifymusic.tar.gz"
}
```
%% Cell type:code id:1e487573 tags:
``` python
# load provided files
zip_path = BASE_PATH / "resource" / "1_audio_files" / "emotifymusic.zip"
dir_path = BASE_PATH / "tmp" / "1_audio_files" / "music"
dir_path.mkdir(parents=True, exist_ok=True)
# unzip to dir_path
with zipfile.ZipFile(zip_path, "r") as zfile:
zfile.extractall(path=dir_path)
```
%% Cell type:code id:c3193f35 tags:
``` python
file_paths = list(dir_path.rglob('**/*.*'))
flattened_dir_path = BASE_PATH / "tmp" / "1_audio_files" / "flattened"
flattened_dir_path.mkdir(parents=True, exist_ok=True)
for path in file_paths:
(flattened_dir_path / path.relative_to(dir_path).as_posix().replace('/', '_')).write_bytes(path.read_bytes())
```
%% Cell type:code id:3272ea2b tags:
``` python
tar_path = Path(OUTPUT_PATHS["audio_tar"])
tar_path.parent.mkdir(parents=True, exist_ok=True)
with tarfile.open(tar_path, "w:gz") as file:
file.add(flattened_dir_path, arcname=os.path.sep)
```
......
%% Cell type:markdown id:699a83ce tags:
# Feature Extraction of Base audio files from Invenio
%% Cell type:code id:6463a609 tags:
``` python
from contextlib import contextmanager, redirect_stderr, redirect_stdout
import pandas as pd
import librosa
import tarfile
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from definitions import BASE_PATH
import os
```
%% Cell type:code id:f025335b tags:parameters
``` python
INPUT_PATH = BASE_PATH / "tmp" / "2_generate_features" / "input"
OUTPUT_PATH = BASE_PATH / "tmp" / "2_generate_features" / "output"
INPUT_PATHS: dict[str, str] = {
"audio_tar": (INPUT_PATH / "emotifymusic.tar.gz").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"raw_features": (OUTPUT_PATH / "raw_features.csv").__str__()
}
```
%% Cell type:code id:704afac7 tags:injected-parameters
%% Cell type:code id:f640e1a8 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/2_generate_features/input/emotifymusic.tar.gz"
}
OUTPUT_PATHS = {
"raw_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/2_generate_features/output/raw_features.csv"
}
```
%% Cell type:code id:10f1b3cd tags:
``` python
# inputs
DEFAULT_SAMPLING_RATE = 22050
assert INPUT_PATH.exists() and INPUT_PATH.is_dir()
with tarfile.open(audio_gz := Path(INPUT_PATHS["audio_tar"]).resolve(), "r:gz") as archive:
archive.extractall(path=(path_out := audio_gz.with_suffix("").with_suffix("")))
files = list(path_out.rglob("**/*.*"))
```
%% Cell type:code id:469af6f9 tags:
``` python
@contextmanager
def suppress_stdout_stderr():
"""A context manager that redirects stdout and stderr to devnull"""
with open(os.devnull, 'w') as fnull:
with redirect_stderr(fnull) as err, redirect_stdout(fnull) as out:
yield err, out
```
%% Cell type:code id:316f6c17 tags:
``` python
def generate_mfcc_feature(filepath: Path, sr: int = DEFAULT_SAMPLING_RATE, number_mfccs: int = 40):
x, _ = load_mp3(filepath, sr=sr)
assert sr == _
mfcc = librosa.feature.mfcc(x, sr=sr, n_mfcc=number_mfccs)
# transpose to use mfcc bands as columns instead of rows
return pd.DataFrame(mfcc).transpose()
def load_mp3(filepath: Path, sr: int = DEFAULT_SAMPLING_RATE):
x, sr = librosa.load(filepath, sr=sr) # extract wave (x) with sample rate (sr)
return x, sr
with suppress_stdout_stderr(), ThreadPoolExecutor(6) as executor:
dataframes = list(executor.map(
lambda args: generate_mfcc_feature(args), files)
)
```
%% Cell type:code id:acc9bae8 tags:
``` python
for file, dataframe in zip(files, dataframes):
dataframe["sample"] = dataframe.index.to_numpy(copy=True)
dataframe["filename"] = file.name
dataframe["label"] = file.name.split('_')[0] # extract genre from file name
dataframe_concat = pd.concat(dataframes)
columns_old = list(dataframe_concat.columns)
columns = columns_old[-3:] + columns_old[:-3]
dataframe_concat = dataframe_concat[columns]
output: pd.DataFrame = dataframe_concat
output
```
%% Output
sample filename label 0 1 \
0 0 classical_8.mp3 classical -513.835449 0.000000
1 1 classical_8.mp3 classical -430.772858 99.951447
2 2 classical_8.mp3 classical -312.093567 159.784668
3 3 classical_8.mp3 classical -243.798019 168.200287
4 4 classical_8.mp3 classical -250.946625 182.020203
... ... ... ... ... ...
2581 2581 electronic_28.mp3 electronic -4.531759 85.749336
2582 2582 electronic_28.mp3 electronic -21.892481 64.973923
2583 2583 electronic_28.mp3 electronic -26.937489 59.654442
2584 2584 electronic_28.mp3 electronic -37.675701 69.980713
2585 2585 electronic_28.mp3 electronic -69.959473 90.579102
2 3 4 5 6 ... 30 \
0 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000
1 61.102493 28.070032 15.340330 15.008282 11.502503 ... -4.017534
2 31.906086 25.901234 6.815042 3.911939 21.410465 ... 3.267372
3 16.092997 34.248627 3.439126 4.217156 16.333824 ... 8.645699
4 12.093463 31.393484 10.792284 5.874646 15.635584 ... 6.143005
... ... ... ... ... ... ... ...
2581 3.175902 29.282883 10.520454 28.353235 7.040113 ... -0.076582
2582 0.638062 30.259424 3.547897 25.982525 12.492319 ... -4.140548
2583 3.198796 36.822197 -0.308186 17.223629 12.519827 ... -2.150106
2584 6.486831 36.693054 -2.817516 14.450989 9.200117 ... 0.592433
2585 12.684738 39.559166 -2.489999 13.447134 2.889965 ... 2.153978
31 32 33 34 35 36 \
0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1 -2.689229 -2.293572 -2.991963 -3.644343 -4.003089 -4.528318
2 -2.944059 -7.677339 -3.628831 -4.110184 -14.840838 -3.495162
3 -5.766571 -5.486410 -3.288999 -3.853479 -19.015926 -7.971353
4 -2.007963 -7.107271 -5.137182 -7.456434 -19.914568 -8.567856
... ... ... ... ... ... ...
2581 10.373774 -3.842222 1.740638 -4.820115 5.424960 -0.350912
2582 8.154976 -8.581367 0.991196 -7.903484 5.064352 -7.015607
2583 6.751756 -8.335445 -3.181783 -11.748012 3.223699 -10.738268
2584 4.523458 -8.737437 -4.725236 -7.613096 1.976833 -9.998651
2585 6.035127 -8.183851 -0.212283 -1.487655 -2.779953 -5.455588
37 38 39
0 0.000000 0.000000 0.000000
1 -4.626081 -2.798346 0.923011
2 8.776964 -4.981813 -10.156776
3 9.408128 -3.466177 -11.191519
4 4.395530 -5.535549 -9.764086
... ... ... ...
2581 3.484543 4.927905 7.667750
2582 2.761323 2.499545 4.854020
2583 -1.915628 -2.164130 -0.500030
2584 -1.651334 -1.831298 -1.857335
2585 0.809570 -1.209018 -1.631956
[1029854 rows x 43 columns]
%% Cell type:code id:0abf745b tags:
``` python
# outputs
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
output.to_csv(OUTPUT_PATHS["raw_features"], index=False)
```
......
%% Cell type:markdown id:f48a4573 tags:
# Aggregate MFCC Features
Aggregate from n rows par file to 1 (calculate min, max, etc. for each feature).
%% Cell type:code id:389576b8 tags:
``` python
from pathlib import Path
import pandas as pd
from definitions import BASE_PATH
```
%% Cell type:code id:26f640e0 tags:parameters
``` python
INPUT_PATH = BASE_PATH / "tmp" / "3_aggregate_features" / "input"
OUTPUT_PATH = BASE_PATH / "tmp" / "3_aggregate_features" / "output"
INPUT_PATHS: dict[str, str] = {
"raw_features": (INPUT_PATH / "raw_features.csv").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"features": (OUTPUT_PATH / "features.csv").__str__()
}
```
%% Cell type:code id:88ecee07 tags:injected-parameters
%% Cell type:code id:40dbf7fa tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"raw_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/3_aggregate_features/input/raw_features.csv"
"raw_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/input/raw_features.csv"
}
OUTPUT_PATHS = {
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/3_aggregate_features/output/features.csv"
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/output/features.csv"
}
```
%% Cell type:code id:c5d9d980 tags:
``` python
# inputs
raw_features = pd.read_csv(INPUT_PATHS["raw_features"], index_col=False)
```
%% Cell type:code id:99f75f47 tags:
``` python
meta_columns = ["sample", "filename", "label"]
mfcc_aggregated = raw_features\
.drop(meta_columns, axis=1, errors='ignore')\
.groupby(raw_features.filename).agg(['min', 'max', 'mean', 'std', 'skew'])
mfcc_meta = pd.DataFrame(raw_features['label'].groupby(raw_features.filename).last())
mfcc_meta.columns = pd.MultiIndex.from_arrays([['label'], ['']]) # needed for merge
mfcc_merged = pd.merge(mfcc_meta, mfcc_aggregated, left_index=True, right_index=True)
# reduce multi index to single index
one_level_cols = ['_'.join([str(el) for el in col]) for col in mfcc_merged.columns[1:]]
one_level_cols.insert(0, "label")
mfcc_merged.columns = pd.Index(one_level_cols)
mfcc_merged = mfcc_merged.reset_index()
mfcc_merged
```
%% Output
filename label 0_min 0_max 0_mean \
0 classical_1.mp3 classical -530.78436 -163.308350 -302.203167
1 classical_10.mp3 classical -562.85785 -96.164795 -219.259016
2 classical_100.mp3 classical -536.23737 -61.608826 -177.804114
3 classical_11.mp3 classical -536.45746 -120.429665 -222.126303
4 classical_12.mp3 classical -562.67523 -148.133560 -270.975406
.. ... ... ... ... ...
395 rock_95.mp3 rock -553.11010 -5.218835 -193.506047
396 rock_96.mp3 rock -541.23600 27.163334 -119.113996
397 rock_97.mp3 rock -518.49500 58.526745 -66.267744
398 rock_98.mp3 rock -518.64307 53.555115 -45.734517
399 rock_99.mp3 rock -544.70310 75.612130 -49.380943
0_std 0_skew 1_min 1_max 1_mean ... 38_min \
0 51.142183 -0.468374 0.000000 178.75162 111.332342 ... -44.098070
1 53.561838 -0.772320 0.029056 259.63270 215.094182 ... -27.458416
2 83.381622 -2.587179 0.000000 190.47589 112.471713 ... -27.335688
3 76.246992 -2.402418 0.000000 159.42575 99.853645 ... -31.774948
4 52.191182 -0.366586 0.000000 194.26416 148.226647 ... -44.843810
.. ... ... ... ... ... ... ...
395 76.869437 -0.201055 -89.948746 201.18045 111.724191 ... -27.043941
396 58.420684 -0.957699 -7.415961 210.49246 125.453699 ... -37.584858
397 65.635619 -0.898026 -58.824410 175.20135 99.288265 ... -29.620445
398 52.444200 -1.705641 0.000000 187.04274 96.440874 ... -26.967848
399 54.045627 -0.863093 -32.930653 191.73538 93.971242 ... -21.929403
38_max 38_mean 38_std 38_skew 39_min 39_max 39_mean \
0 47.308060 -3.713503 16.553984 0.230691 -46.794480 49.352516 -2.282116
1 29.811110 0.484271 8.660648 -0.479016 -28.989983 27.533710 0.952658
2 27.610388 -0.333233 8.185075 0.208425 -38.095375 31.397880 -1.494916
3 31.500881 -3.781627 9.191043 0.260886 -22.667440 50.992897 1.600777
4 28.490644 -6.242015 10.546545 0.341848 -25.040888 46.878204 1.844494
.. ... ... ... ... ... ... ...
395 22.451445 -7.234634 8.471853 0.753855 -24.712723 23.410387 -4.502398
396 28.087936 -9.704238 8.447620 0.112760 -38.147890 21.814402 -8.249507
397 26.325895 -5.722825 7.727378 0.207489 -29.497524 25.410654 -3.356614
398 8.714737 -9.511491 5.551820 -0.025604 -23.020084 13.948638 -2.664985
399 17.050608 -5.296691 5.894963 0.390705 -20.983192 29.312023 -0.321836
39_std 39_skew
0 15.285639 0.171462
1 10.477735 -0.185771
2 10.917299 0.020985
3 10.125545 0.595763
4 11.160392 0.503120
.. ... ...
395 6.687984 0.238807
396 7.807756 0.071968
397 8.170526 0.160330
398 5.051498 -0.258407
399 6.571660 0.384794
[400 rows x 202 columns]
%% Cell type:code id:4ac5c765 tags:
``` python
# outputs
aggregated_features_path = Path(OUTPUT_PATHS["aggregated_features"]).resolve()
aggregated_features_path.parent.mkdir(parents=True, exist_ok=True)
output = mfcc_merged
output.to_csv(aggregated_features_path, index=False)
```
......
%% Cell type:markdown id:e92b4fe9 tags:
# Split the Features into Train and Test Set
%% Cell type:code id:5f1fae44 tags:
``` python
import pandas as pd
from pathlib import Path
from definitions import BASE_PATH
```
%% Cell type:code id:01de1b27 tags:parameters
``` python
# Tagged with 'parameters'
from definitions import BASE_PATH
INPUT_PATHS: dict[str, str] = {
"features": (BASE_PATH / "tmp" / "4_split" / "input" / "features.csv").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"split": (BASE_PATH / "tmp" / "4_split" / "output" / "split.csv").__str__()
}
```
%% Cell type:code id:e99ca0ba tags:injected-parameters
%% Cell type:code id:fdc0a0a6 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/input/features.csv"
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/input/features.csv"
}
OUTPUT_PATHS = {
"split": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/output/split.csv"
"split": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/output/split.csv"
}
```
%% Cell type:code id:a4cc6800 tags:
``` python
# INPUT
for path in INPUT_PATHS.values():
assert Path(path).exists()
features = pd.read_csv(INPUT_PATHS["aggregated_features"])
```
%% Cell type:code id:a186d0c4 tags:
``` python
train = features.sample(frac=0.8).sort_index()
train = features.sample(frac=0.8, random_state=11908553).sort_index()
test = features.drop(train.index)
split_true = pd.DataFrame({
"filename": train.filename,
"train": True
})
split_false = pd.DataFrame({
"filename": test.filename,
"train": False
})
split_concat = pd.concat([split_true, split_false])\
.sort_values("filename")\
.reset_index(drop=True)
```
%% Cell type:code id:091e0641 tags:
``` python
split_concat
```
%% Output
filename train
0 classical_1.mp3 False
1 classical_10.mp3 True
2 classical_100.mp3 False
3 classical_11.mp3 True
4 classical_12.mp3 True
.. ... ...
395 rock_95.mp3 False
396 rock_96.mp3 True
397 rock_97.mp3 True
398 rock_98.mp3 True
399 rock_99.mp3 True
[400 rows x 2 columns]
%% Cell type:code id:7b11b8bb tags:
``` python
# output
OUTPUT_PATH = Path(OUTPUT_PATHS["split"])
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
output = split_concat
output.to_csv(OUTPUT_PATH, index=False)
```
......
%% Cell type:markdown id:5de30442 tags:
# ML Experiment code
# Inputs: splits & aggregated features
%% Cell type:code id:a2eb8998 tags:
``` python
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
from pandas import DataFrame, Index
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt
from definitions import BASE_PATH
```
%% Cell type:code id:8a8da20f tags:parameters
``` python
# Tagged with 'parameters'
# Default (un-injected) locations for this notebook's inputs and outputs;
# papermill-style parameter injection overrides these in the next cell.
INPUT_PATH = BASE_PATH / "tmp" / "5_ml_model" / "input"
OUTPUT_PATH = BASE_PATH / "tmp" / "5_ml_model" / "output"
# Expected inputs: the train/test split table and the feature table.
INPUT_PATHS: dict[str, str] = {
    "split": (INPUT_PATH / "split.csv").__str__(),
    "features": (INPUT_PATH / "features.csv").__str__()
}
# Expected outputs: prediction table and the pickled classifier.
OUTPUT_PATHS: dict[str, str] = {
    "submission": (OUTPUT_PATH / "submission.csv").__str__(),
    "clf": (OUTPUT_PATH / "clf.pickle").__str__()
}
```
%% Cell type:code id:08b56684 tags:injected-parameters
%% Cell type:code id:1229e75d tags:injected-parameters
``` python
# Parameters
# NOTE(review): the duplicate dictionary keys below are a revision-diff
# artifact (old fairnb paths vs. new dbrepo-ismir paths); Python keeps the
# last occurrence, so the dbrepo-ismir paths are the effective values.
INPUT_PATHS = {
    "split": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/5_ml_model/input/split.csv",
    "aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/5_ml_model/input/features.csv",
    "split": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/5_ml_model/input/split.csv",
    "aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/5_ml_model/input/features.csv",
}
OUTPUT_PATHS = {
    "clf": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/5_ml_model/output/ml_model.pickle",
    "submission": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/5_ml_model/output/test_result.csv",
    "clf": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/5_ml_model/output/ml_model.pickle",
    "submission": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/5_ml_model/output/test_result.csv",
}
```
%% Cell type:code id:6810272a tags:
``` python
# input
# Load the train/test split flags and the aggregated feature table produced
# by the upstream notebooks.
split: pd.DataFrame = pd.read_csv(INPUT_PATHS["split"])
features: pd.DataFrame = pd.read_csv(INPUT_PATHS["aggregated_features"])
```
%% Cell type:code id:36f06fd6 tags:
``` python
# Attach the train/test flag to each feature row (inner join on filename)
# and use the filename as the index from here on.
joined = features.merge(split, on="filename").set_index("filename")
joined
```
%% Output
label 0_min 0_max 0_mean 0_std \
filename
classical_1.mp3 classical -530.78436 -163.308350 -302.203167 51.142183
classical_10.mp3 classical -562.85785 -96.164795 -219.259016 53.561838
classical_100.mp3 classical -536.23737 -61.608826 -177.804114 83.381622
classical_11.mp3 classical -536.45746 -120.429665 -222.126303 76.246992
classical_12.mp3 classical -562.67523 -148.133560 -270.975406 52.191182
... ... ... ... ... ...
rock_95.mp3 rock -553.11010 -5.218835 -193.506047 76.869437
rock_96.mp3 rock -541.23600 27.163334 -119.113996 58.420684
rock_97.mp3 rock -518.49500 58.526745 -66.267744 65.635619
rock_98.mp3 rock -518.64307 53.555115 -45.734517 52.444200
rock_99.mp3 rock -544.70310 75.612130 -49.380943 54.045627
0_skew 1_min 1_max 1_mean 1_std ... \
filename ...
classical_1.mp3 -0.468374 0.000000 178.75162 111.332342 24.847563 ...
classical_10.mp3 -0.772320 0.029056 259.63270 215.094182 18.388131 ...
classical_100.mp3 -2.587179 0.000000 190.47589 112.471713 27.277553 ...
classical_11.mp3 -2.402418 0.000000 159.42575 99.853645 21.916949 ...
classical_12.mp3 -0.366586 0.000000 194.26416 148.226647 19.305008 ...
... ... ... ... ... ... ...
rock_95.mp3 -0.201055 -89.948746 201.18045 111.724191 36.463584 ...
rock_96.mp3 -0.957699 -7.415961 210.49246 125.453699 31.908869 ...
rock_97.mp3 -0.898026 -58.824410 175.20135 99.288265 25.158416 ...
rock_98.mp3 -1.705641 0.000000 187.04274 96.440874 24.137702 ...
rock_99.mp3 -0.863093 -32.930653 191.73538 93.971242 33.410220 ...
38_max 38_mean 38_std 38_skew 39_min \
filename
classical_1.mp3 47.308060 -3.713503 16.553984 0.230691 -46.794480
classical_10.mp3 29.811110 0.484271 8.660648 -0.479016 -28.989983
classical_100.mp3 27.610388 -0.333233 8.185075 0.208425 -38.095375
classical_11.mp3 31.500881 -3.781627 9.191043 0.260886 -22.667440
classical_12.mp3 28.490644 -6.242015 10.546545 0.341848 -25.040888
... ... ... ... ... ...
rock_95.mp3 22.451445 -7.234634 8.471853 0.753855 -24.712723
rock_96.mp3 28.087936 -9.704238 8.447620 0.112760 -38.147890
rock_97.mp3 26.325895 -5.722825 7.727378 0.207489 -29.497524
rock_98.mp3 8.714737 -9.511491 5.551820 -0.025604 -23.020084
rock_99.mp3 17.050608 -5.296691 5.894963 0.390705 -20.983192
39_max 39_mean 39_std 39_skew train
filename
classical_1.mp3 49.352516 -2.282116 15.285639 0.171462 True
classical_10.mp3 27.533710 0.952658 10.477735 -0.185771 True
classical_100.mp3 31.397880 -1.494916 10.917299 0.020985 True
classical_11.mp3 50.992897 1.600777 10.125545 0.595763 True
classical_12.mp3 46.878204 1.844494 11.160392 0.503120 False
... ... ... ... ... ...
rock_95.mp3 23.410387 -4.502398 6.687984 0.238807 True
rock_96.mp3 21.814402 -8.249507 7.807756 0.071968 True
rock_97.mp3 25.410654 -3.356614 8.170526 0.160330 True
rock_98.mp3 13.948638 -2.664985 5.051498 -0.258407 True
rock_99.mp3 29.312023 -0.321836 6.571660 0.384794 True
[400 rows x 202 columns]
%% Cell type:code id:265d042f tags:
``` python
# Training partition: rows flagged train == True, with the flag column removed.
train: DataFrame = joined.loc[joined["train"] == True].drop(columns="train")
train
```
%% Output
label 0_min 0_max 0_mean 0_std \
filename
classical_1.mp3 classical -530.78436 -163.308350 -302.203167 51.142183
classical_10.mp3 classical -562.85785 -96.164795 -219.259016 53.561838
classical_100.mp3 classical -536.23737 -61.608826 -177.804114 83.381622
classical_11.mp3 classical -536.45746 -120.429665 -222.126303 76.246992
classical_13.mp3 classical -637.72064 -177.713960 -361.834032 71.310080
... ... ... ... ... ...
rock_95.mp3 rock -553.11010 -5.218835 -193.506047 76.869437
rock_96.mp3 rock -541.23600 27.163334 -119.113996 58.420684
rock_97.mp3 rock -518.49500 58.526745 -66.267744 65.635619
rock_98.mp3 rock -518.64307 53.555115 -45.734517 52.444200
rock_99.mp3 rock -544.70310 75.612130 -49.380943 54.045627
0_skew 1_min 1_max 1_mean 1_std ... \
filename ...
classical_1.mp3 -0.468374 0.000000 178.75162 111.332342 24.847563 ...
classical_10.mp3 -0.772320 0.029056 259.63270 215.094182 18.388131 ...
classical_100.mp3 -2.587179 0.000000 190.47589 112.471713 27.277553 ...
classical_11.mp3 -2.402418 0.000000 159.42575 99.853645 21.916949 ...
classical_13.mp3 0.008325 0.000000 257.16284 211.556558 20.347034 ...
... ... ... ... ... ... ...
rock_95.mp3 -0.201055 -89.948746 201.18045 111.724191 36.463584 ...
rock_96.mp3 -0.957699 -7.415961 210.49246 125.453699 31.908869 ...
rock_97.mp3 -0.898026 -58.824410 175.20135 99.288265 25.158416 ...
rock_98.mp3 -1.705641 0.000000 187.04274 96.440874 24.137702 ...
rock_99.mp3 -0.863093 -32.930653 191.73538 93.971242 33.410220 ...
38_min 38_max 38_mean 38_std 38_skew \
filename
classical_1.mp3 -44.098070 47.308060 -3.713503 16.553984 0.230691
classical_10.mp3 -27.458416 29.811110 0.484271 8.660648 -0.479016
classical_100.mp3 -27.335688 27.610388 -0.333233 8.185075 0.208425
classical_11.mp3 -31.774948 31.500881 -3.781627 9.191043 0.260886
classical_13.mp3 -24.728806 18.424036 -0.275736 7.026148 -0.640964
... ... ... ... ... ...
rock_95.mp3 -27.043941 22.451445 -7.234634 8.471853 0.753855
rock_96.mp3 -37.584858 28.087936 -9.704238 8.447620 0.112760
rock_97.mp3 -29.620445 26.325895 -5.722825 7.727378 0.207489
rock_98.mp3 -26.967848 8.714737 -9.511491 5.551820 -0.025604
rock_99.mp3 -21.929403 17.050608 -5.296691 5.894963 0.390705
39_min 39_max 39_mean 39_std 39_skew
filename
classical_1.mp3 -46.794480 49.352516 -2.282116 15.285639 0.171462
classical_10.mp3 -28.989983 27.533710 0.952658 10.477735 -0.185771
classical_100.mp3 -38.095375 31.397880 -1.494916 10.917299 0.020985
classical_11.mp3 -22.667440 50.992897 1.600777 10.125545 0.595763
classical_13.mp3 -24.319565 18.439262 -2.147022 8.171929 0.009566
... ... ... ... ... ...
rock_95.mp3 -24.712723 23.410387 -4.502398 6.687984 0.238807
rock_96.mp3 -38.147890 21.814402 -8.249507 7.807756 0.071968
rock_97.mp3 -29.497524 25.410654 -3.356614 8.170526 0.160330
rock_98.mp3 -23.020084 13.948638 -2.664985 5.051498 -0.258407
rock_99.mp3 -20.983192 29.312023 -0.321836 6.571660 0.384794
[320 rows x 201 columns]
%% Cell type:code id:1649ce52 tags:
``` python
# Held-out partition: rows flagged train == False, with the flag column removed.
test: DataFrame = joined.loc[joined["train"] == False].drop(columns="train")
test
```
%% Output
label 0_min 0_max 0_mean 0_std \
filename
classical_12.mp3 classical -562.67523 -148.133560 -270.975406 52.191182
classical_2.mp3 classical -549.40650 -192.532060 -293.008969 27.207028
classical_20.mp3 classical -605.99150 -161.119310 -263.483084 49.157298
classical_27.mp3 classical -595.41895 -78.118810 -265.344461 104.892303
classical_39.mp3 classical -578.84720 -55.479320 -183.753039 69.140628
... ... ... ... ... ...
rock_85.mp3 rock -556.08203 44.890602 -72.618399 80.272023
rock_86.mp3 rock -534.40650 42.919650 -93.601685 62.192619
rock_88.mp3 rock -539.97880 44.375150 -126.955020 88.140999
rock_92.mp3 rock -532.89110 13.948147 -206.891688 80.812274
rock_93.mp3 rock -570.46650 -26.067888 -302.483118 96.569376
0_skew 1_min 1_max 1_mean 1_std ... \
filename ...
classical_12.mp3 -0.366586 0.000000 194.26416 148.226647 19.305008 ...
classical_2.mp3 -0.426848 0.000000 231.03738 198.662514 14.957660 ...
classical_20.mp3 -0.856221 0.000000 191.92676 141.393817 17.754779 ...
classical_27.mp3 -0.526604 0.000000 200.61633 144.208488 25.198761 ...
classical_39.mp3 -0.577055 0.000000 193.84949 127.058496 29.295691 ...
... ... ... ... ... ... ...
rock_85.mp3 -2.269420 -13.219891 205.14955 96.863927 38.352424 ...
rock_86.mp3 -0.869415 0.000000 206.32501 128.047509 30.374850 ...
rock_88.mp3 -1.700578 -19.007393 201.99960 99.760978 32.572320 ...
rock_92.mp3 0.090286 -47.724570 179.76506 109.954998 37.880477 ...
rock_93.mp3 0.159026 -89.999680 211.88910 103.686365 40.373592 ...
38_min 38_max 38_mean 38_std 38_skew \
filename
classical_12.mp3 -44.843810 28.490644 -6.242015 10.546545 0.341848
classical_2.mp3 -25.912933 24.293318 0.746096 8.240027 -0.022513
classical_20.mp3 -24.911243 38.551230 -2.274261 9.671005 0.719436
classical_27.mp3 -28.797087 20.897750 -5.761607 7.108055 0.360305
classical_39.mp3 -48.678460 24.566566 -7.810246 11.568188 -0.106704
... ... ... ... ... ...
rock_85.mp3 -22.633102 13.513550 -3.126545 5.035097 -0.035805
rock_86.mp3 -30.471783 20.564953 -3.383356 6.405211 -0.185147
rock_88.mp3 -34.726500 26.706833 -5.827121 8.260717 0.275225
rock_92.mp3 -37.614220 21.420666 -8.287362 7.851784 -0.080285
rock_93.mp3 -28.903786 35.712753 2.073339 10.995769 0.249798
39_min 39_max 39_mean 39_std 39_skew
filename
classical_12.mp3 -25.040888 46.878204 1.844494 11.160392 0.503120
classical_2.mp3 -18.561390 23.484133 3.115819 7.220346 0.242364
classical_20.mp3 -30.311798 29.272330 0.289613 9.590299 -0.244191
classical_27.mp3 -39.705540 25.803795 -2.736776 10.101577 -0.463730
classical_39.mp3 -24.328775 40.172250 -0.078006 10.646963 0.492488
... ... ... ... ... ...
rock_85.mp3 -19.814285 18.576450 -1.172361 6.078238 -0.048851
rock_86.mp3 -28.917618 26.702751 -1.950565 6.725107 -0.253487
rock_88.mp3 -31.036520 27.423218 -4.715363 6.544117 0.184718
rock_92.mp3 -41.547260 25.628895 -9.046777 8.779821 0.071449
rock_93.mp3 -30.178170 30.612560 -4.677735 8.877041 0.149639
[80 rows x 201 columns]
%% Cell type:code id:1e904bf3 tags:
%% Cell type:code id:1c01673464cb048e tags:
``` python
# remove labels
# Feature matrix without the target column; `errors='ignore'` tolerates a
# frame that already lacks 'label'.
X = train.drop(['label'], axis=1, errors='ignore')
columns: Index = X.columns
# Sorted unique class names -> ["classical", "electronic", "pop", "rock"]
classnames = np.sort(np.unique(joined.label.values))
# Bidirectional O(1) lookup tables between class name and integer index
# (dict comprehensions replace the original manual fill loop).
classname2index = {classname: i for i, classname in enumerate(classnames)}
index2classname = {i: classname for i, classname in enumerate(classnames)}
# Encode the training labels as integer class indices.
y = np.array([classname2index[classname] for classname in train.label.values])
(X, y)
```
%% Output
( 0_min 0_max 0_mean 0_std 0_skew \
filename
classical_1.mp3 -530.78436 -163.308350 -302.203167 51.142183 -0.468374
classical_10.mp3 -562.85785 -96.164795 -219.259016 53.561838 -0.772320
classical_100.mp3 -536.23737 -61.608826 -177.804114 83.381622 -2.587179
classical_11.mp3 -536.45746 -120.429665 -222.126303 76.246992 -2.402418
classical_13.mp3 -637.72064 -177.713960 -361.834032 71.310080 0.008325
... ... ... ... ... ...
rock_95.mp3 -553.11010 -5.218835 -193.506047 76.869437 -0.201055
rock_96.mp3 -541.23600 27.163334 -119.113996 58.420684 -0.957699
rock_97.mp3 -518.49500 58.526745 -66.267744 65.635619 -0.898026
rock_98.mp3 -518.64307 53.555115 -45.734517 52.444200 -1.705641
rock_99.mp3 -544.70310 75.612130 -49.380943 54.045627 -0.863093
1_min 1_max 1_mean 1_std 1_skew ... \
filename ...
classical_1.mp3 0.000000 178.75162 111.332342 24.847563 -0.402642 ...
classical_10.mp3 0.029056 259.63270 215.094182 18.388131 -1.528751 ...
classical_100.mp3 0.000000 190.47589 112.471713 27.277553 -1.318523 ...
classical_11.mp3 0.000000 159.42575 99.853645 21.916949 -1.176922 ...
classical_13.mp3 0.000000 257.16284 211.556558 20.347034 -1.050119 ...
... ... ... ... ... ... ...
rock_95.mp3 -89.948746 201.18045 111.724191 36.463584 -0.443224 ...
rock_96.mp3 -7.415961 210.49246 125.453699 31.908869 -0.547469 ...
rock_97.mp3 -58.824410 175.20135 99.288265 25.158416 -0.568057 ...
rock_98.mp3 0.000000 187.04274 96.440874 24.137702 -0.145217 ...
rock_99.mp3 -32.930653 191.73538 93.971242 33.410220 0.040113 ...
38_min 38_max 38_mean 38_std 38_skew \
filename
classical_1.mp3 -44.098070 47.308060 -3.713503 16.553984 0.230691
classical_10.mp3 -27.458416 29.811110 0.484271 8.660648 -0.479016
classical_100.mp3 -27.335688 27.610388 -0.333233 8.185075 0.208425
classical_11.mp3 -31.774948 31.500881 -3.781627 9.191043 0.260886
classical_13.mp3 -24.728806 18.424036 -0.275736 7.026148 -0.640964
... ... ... ... ... ...
rock_95.mp3 -27.043941 22.451445 -7.234634 8.471853 0.753855
rock_96.mp3 -37.584858 28.087936 -9.704238 8.447620 0.112760
rock_97.mp3 -29.620445 26.325895 -5.722825 7.727378 0.207489
rock_98.mp3 -26.967848 8.714737 -9.511491 5.551820 -0.025604
rock_99.mp3 -21.929403 17.050608 -5.296691 5.894963 0.390705
39_min 39_max 39_mean 39_std 39_skew
filename
classical_1.mp3 -46.794480 49.352516 -2.282116 15.285639 0.171462
classical_10.mp3 -28.989983 27.533710 0.952658 10.477735 -0.185771
classical_100.mp3 -38.095375 31.397880 -1.494916 10.917299 0.020985
classical_11.mp3 -22.667440 50.992897 1.600777 10.125545 0.595763
classical_13.mp3 -24.319565 18.439262 -2.147022 8.171929 0.009566
... ... ... ... ... ...
rock_95.mp3 -24.712723 23.410387 -4.502398 6.687984 0.238807
rock_96.mp3 -38.147890 21.814402 -8.249507 7.807756 0.071968
rock_97.mp3 -29.497524 25.410654 -3.356614 8.170526 0.160330
rock_98.mp3 -23.020084 13.948638 -2.664985 5.051498 -0.258407
rock_99.mp3 -20.983192 29.312023 -0.321836 6.571660 0.384794
[320 rows x 200 columns],
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]))
%% Cell type:code id:32e5e889 tags:
%% Cell type:code id:41ce60fbed0a23bc tags:
``` python
# Build the held-out feature matrix and label vector using the same
# encoding as the training data.
X_test = test.drop(['label'], axis=1, errors='ignore')
print(X.shape)
print(X_test.shape)
print(X_test.shape[0] / X.shape[0]) # fraction of test sample
# Encode test labels with the mapping fitted on the full label set above.
y_test = np.array([classname2index[classname] for classname in test.label.values])
y_test
```
%% Output
(320, 200)
(80, 200)
0.25
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
%% Cell type:code id:e165922f tags:
%% Cell type:code id:99dc29024df3d251 tags:
``` python
# Standardize for PCA
# Fit the scaler on the training features only and reuse it for the test
# features, so no test-set statistics leak into the transform.
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X.values)
X_test_standardized = scaler.transform(X_test.values)
X_standardized
```
%% Output
array([[ 0.38209988, -1.79901606, -1.34294124, ..., -0.7312519 ,
3.4358529 , 0.11530124],
[-0.42728837, -0.93236007, -0.41652953, ..., 0.22563011,
1.37555438, -0.86835549],
[ 0.24449084, -0.48632861, 0.04648451, ..., -0.49838941,
1.56391778, -0.29904453],
...,
[ 0.69222714, 1.06432227, 1.29224565, ..., -1.0491004 ,
0.38686173, 0.08464998],
[ 0.68849053, 1.00015092, 1.52158336, ..., -0.84450893,
-0.94971424, -1.06836048],
[ 0.03085452, 1.28485202, 1.48085606, ..., -0.15137928,
-0.29828957, 0.70271937]])
%% Cell type:code id:d389fd70 tags:
%% Cell type:code id:3f30e11dc4688246 tags:
``` python
# Reduce Dimensions via PCA
# NOTE(review): the two `pca =` lines are a revision-diff artifact; the
# second fit (30 components) is the effective one. The printed output below
# (0.855 explained variance, shape (320, 50)) appears to be from the older
# 50-component run — re-execute to refresh.
pca = PCA(n_components=50).fit(X_standardized)
pca = PCA(n_components=30).fit(X_standardized)
X_pca = pca.transform(X_standardized)
X_test_pca = pca.transform(X_test_standardized)
print(sum(pca.explained_variance_ratio_))
print(X_pca.shape)
print(X_test_pca.shape)
print(y.shape)
```
%% Output
0.8557392011152061
(320, 50)
(80, 50)
(320,)
%% Cell type:code id:aa1d9036 tags:
%% Cell type:code id:21bf974f979ae1f4 tags:
``` python
# Fit SVM:
# Hold out 20% of the PCA-reduced training data as a validation set
# (seeded for reproducibility) and fit a baseline RBF-kernel SVM.
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size = 0.2, random_state=4, shuffle = True)
clf = SVC(kernel='rbf', probability=True)
clf.fit(X_train, y_train)
# Baseline validation accuracy before hyper-parameter tuning.
print(accuracy_score(clf.predict(X_val), y_val))
print(X_val)
print(y_val)
```
%% Output
0.6875
[[-4.64558613 3.08838305 -1.47175688 ... -1.24828691 -0.70095473
0.01689286]
[ 5.85968202 -2.1047151 -3.35419664 ... -1.48822402 1.00205068
-0.98882563]
[ 6.52471238 -2.88386219 -5.91379963 ... 0.08618421 0.03366275
-0.55189302]
...
[ 5.3496866 3.90245458 -4.07128854 ... -0.82356091 -0.7968544
0.26045289]
[ 6.68981697 -1.18340439 -0.12267599 ... 1.33593613 -2.8015435
0.5028293 ]
[-4.78063681 -7.16377441 4.09506551 ... -1.0308011 0.83671387
-0.07027211]]
[3 0 3 2 3 0 1 2 0 3 0 0 0 1 2 1 2 3 1 1 1 0 3 0 0 0 3 1 1 3 3 2 3 1 2 1 0
1 0 1 3 0 0 0 0 3 3 3 0 3 3 3 1 2 2 0 1 2 1 2 3 2 1 0]
%% Cell type:code id:fc48c86e tags:
%% Cell type:code id:6099c8ae2b4be921 tags:
``` python
# grid for C, gamma
# NOTE(review): the duplicated `C_grid =` and `grid.fit(...)` lines are a
# revision-diff artifact; the second occurrence of each wins, so the search
# runs over the extended C grid and fits on the full training set (X_pca, y).
C_grid = [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
C_grid = [0.0001, 0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
gamma_grid = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'C': C_grid, 'gamma': gamma_grid}
# 5-fold cross-validated search over (C, gamma) for the RBF SVM.
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)
grid.fit(X_pca, y)
# Find the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
# NOTE(review): X_val is part of X_pca after the second fit, so this
# validation accuracy is optimistic — confirm whether that is intended.
print(accuracy_score(grid.predict(X_val), y_val))
```
%% Output
0.7343891402714932
{'C': 3, 'gamma': 0.01}
SVC(C=3, gamma=0.01)
0.78125
%% Cell type:code id:7cf87469 tags:
%% Cell type:code id:43a8791efe8809f4 tags:
``` python
# Optimal model
# Fit entire training sets with optimal model
# NOTE(review): C=4 does not match the grid-search best above ({'C': 3,
# 'gamma': 0.01}) — confirm whether this is deliberate. Also, despite the
# comment, this cell fits only X_train; the full-set fit happens below.
clf = SVC(kernel='rbf', C=4, gamma=0.01, probability=True)
clf.fit(X_train, y_train)
print(accuracy_score(clf.predict(X_val), y_val))
```
%% Output
0.78125
%% Cell type:code id:5a754cd1 tags:
``` python
# Fit entire training sets
# Refit the tuned classifier on all PCA-reduced training data, then score
# the held-out test partition.
clf.fit(X_pca, y)
# Per-class probabilities for the test set, reused later for the
# probability table.
proba = clf.predict_proba(X_test_pca)
# NOTE(review): the plain print lines and the f-string line are old/new
# revision-diff duplicates of the same accuracy report.
print(accuracy_score(clf.predict(X_test_pca), y_test))
print(clf.predict_proba(X_test_pca))
print(f"Accuracy score: {accuracy_score(clf.predict(X_test_pca), y_test)}")
```
%% Output
0.8
[[9.60125451e-01 2.54410379e-02 1.00183548e-02 4.41515609e-03]
[9.93544791e-01 4.04634019e-03 1.20649558e-03 1.20237342e-03]
[9.97430192e-01 1.76800719e-04 5.38565546e-04 1.85444214e-03]
[9.79967977e-01 6.86113735e-03 9.68497114e-03 3.48591496e-03]
[9.91884967e-01 5.17290348e-03 1.26266158e-03 1.67946793e-03]
[9.85578464e-01 9.44992493e-03 3.75086068e-03 1.22075036e-03]
[2.04862989e-01 4.53621014e-01 1.34373358e-01 2.07142639e-01]
[9.99181855e-01 4.86945868e-04 2.22608725e-04 1.08590413e-04]
[9.92658119e-01 3.47218548e-03 2.74696376e-03 1.12273207e-03]
[9.99656357e-01 1.12727916e-04 1.43400994e-04 8.75138776e-05]
[8.47319131e-01 4.69014094e-02 7.09411516e-02 3.48383077e-02]
[1.28380278e-01 3.67332428e-01 3.59429595e-01 1.44857699e-01]
[9.96413445e-01 2.75890076e-03 4.65504357e-04 3.62150045e-04]
[9.98826125e-01 7.62447290e-04 3.01490088e-04 1.09937383e-04]
[9.99401836e-01 8.67850526e-05 3.74373911e-04 1.37005308e-04]
[9.97955498e-01 1.69931669e-03 1.73626292e-04 1.71558652e-04]
[8.45643860e-01 1.33426916e-02 9.97412359e-02 4.12722121e-02]
[9.82092462e-01 1.15346135e-02 3.19973757e-03 3.17318740e-03]
[9.83213850e-01 1.24420959e-02 3.26304918e-03 1.08100527e-03]
[9.99642856e-01 7.19184901e-05 1.55316717e-04 1.29908898e-04]
[9.97979494e-01 1.76870557e-03 1.31807873e-04 1.19992584e-04]
[4.92333515e-04 9.38096306e-01 2.10469538e-02 4.03644064e-02]
[9.45551189e-03 4.32699483e-01 4.16341606e-01 1.41503399e-01]
[9.13893710e-03 4.44229440e-01 3.15860710e-01 2.30770912e-01]
[6.79828415e-02 6.71681498e-01 2.09457159e-01 5.08785014e-02]
[1.68076034e-04 9.71769830e-01 2.24441690e-03 2.58176775e-02]
[5.73737808e-02 8.61494512e-02 5.86365884e-01 2.70110884e-01]
[1.18603200e-01 5.68582627e-01 2.33418558e-01 7.93956149e-02]
[1.11117289e-02 9.36048570e-01 2.07419839e-02 3.20977167e-02]
[4.27128683e-03 2.53015466e-01 4.52073691e-01 2.90639556e-01]
[8.49595708e-03 6.37021927e-01 1.52099758e-01 2.02382358e-01]
[9.29855946e-04 8.43628458e-01 1.67412440e-02 1.38700442e-01]
[5.75440080e-02 6.65893968e-01 1.18869183e-01 1.57692841e-01]
[7.28891949e-02 6.97755501e-01 1.23916666e-01 1.05438637e-01]
[1.00364172e-01 3.05951082e-01 4.02534596e-01 1.91150150e-01]
[2.71956862e-04 5.43067021e-01 1.43066793e-02 4.42354343e-01]
[8.60586155e-02 8.06134589e-02 6.12157762e-01 2.21170163e-01]
[4.54205646e-02 3.77922605e-02 7.46222645e-01 1.70564530e-01]
[2.60732219e-02 1.78887893e-01 3.03253706e-01 4.91785179e-01]
[1.76685545e-01 1.49702306e-01 5.30947449e-01 1.42664700e-01]
[2.10423538e-02 3.16261307e-02 6.86655601e-01 2.60675914e-01]
[5.10365555e-03 9.06077798e-03 3.10609892e-01 6.75225674e-01]
[1.85590659e-04 4.20187052e-01 2.54067881e-01 3.25559476e-01]
[1.84121015e-03 1.49368051e-03 5.94696830e-01 4.01968279e-01]
[9.94756099e-03 1.98337895e-02 6.10189918e-01 3.60028732e-01]
[1.06218859e-02 5.83443846e-02 4.09385718e-01 5.21648011e-01]
[2.51610276e-01 1.06475171e-01 4.02323327e-01 2.39591226e-01]
[1.05739190e-03 4.80039248e-03 7.84298209e-01 2.09844007e-01]
[1.20304373e-03 2.49929289e-03 4.25498367e-01 5.70799297e-01]
[5.17165422e-04 2.44187897e-03 7.70942808e-01 2.26098148e-01]
[1.48279902e-01 4.34212254e-01 3.33486768e-01 8.40210765e-02]
[6.49493657e-03 2.03203941e-03 6.76591245e-01 3.14881779e-01]
[1.42643647e-03 3.00507802e-02 7.66466942e-01 2.02055842e-01]
[2.71205953e-04 1.64674206e-03 5.18908081e-01 4.79173971e-01]
[6.18460044e-04 8.65733199e-03 7.31160871e-01 2.59563337e-01]
[5.99851686e-04 9.88068783e-03 3.18075020e-01 6.71444441e-01]
[8.92857719e-05 2.49912334e-03 8.22928402e-01 1.74483188e-01]
[4.08821963e-03 4.01685411e-03 2.22308630e-01 7.69586296e-01]
[3.85280110e-04 4.28844983e-03 4.38873417e-01 5.56452853e-01]
[7.77946831e-04 9.39309422e-03 1.89573855e-01 8.00255104e-01]
[1.07826925e-03 4.48667610e-03 1.68966113e-01 8.25468942e-01]
[4.32984844e-03 3.71263242e-02 1.74061879e-01 7.84481948e-01]
[8.91964233e-04 4.60229508e-03 2.56203571e-01 7.38302169e-01]
[1.53170345e-04 2.66905629e-03 8.05893086e-01 1.91284687e-01]
[3.76678169e-04 2.66687172e-02 1.35691366e-01 8.37263238e-01]
[1.87189571e-03 2.95477730e-02 1.83614398e-01 7.84965933e-01]
[3.65699757e-04 4.65723230e-02 1.96467002e-01 7.56594975e-01]
[3.91020418e-03 2.21215837e-02 3.46096170e-01 6.27872042e-01]
[3.53128321e-04 1.26062549e-03 4.04030924e-01 5.94355323e-01]
[3.85531972e-04 1.67060179e-03 5.14520249e-01 4.83423617e-01]
[4.01176053e-04 1.39364758e-03 5.62411421e-01 4.35793755e-01]
[2.19890976e-02 4.13933530e-01 3.17505597e-01 2.46571775e-01]
[2.63540892e-03 1.60423321e-02 1.69895446e-01 8.11426813e-01]
[5.95478507e-04 7.12069104e-04 9.01272706e-02 9.08565182e-01]
[2.56904495e-04 3.92709426e-03 3.41668674e-01 6.54147328e-01]
[3.34122792e-04 5.02991556e-03 3.01652248e-01 6.92983714e-01]
[1.74105457e-03 1.54657507e-02 2.27888902e-01 7.54904293e-01]
[3.34518377e-02 5.51052761e-02 3.32962366e-01 5.78480520e-01]
[1.16808056e-03 1.31231889e-03 1.63219289e-01 8.34300311e-01]
[8.88813523e-02 1.55465620e-01 3.86988580e-01 3.68664447e-01]]
%% Cell type:code id:bbd99cb8 tags:
``` python
# svc_path = BASE_PATH / "out" / "SVC"/ "clf.pickle"
# svc_path.parent.mkdir(parents=True, exist_ok=True)
#
# with open(svc_path, "wb") as file:
# pickle.dump(clf, file)
#
# with open(svc_path, "rb") as file:
# loaded = pickle.load(file)
# loaded.predict_proba(X_test_pca)
```
%% Cell type:code id:af3c36d2 tags:
%% Cell type:code id:28c779539faeb27c tags:
``` python
# Fit the entire training sets
def convert_to_labels(preds, i2c, k=3):
    """Map class-probability rows to their top-k class names and indices.

    Args:
        preds: iterable of per-sample class-probability arrays
            (e.g. the output of ``clf.predict_proba``).
        i2c: mapping from class index to class name.
        k: number of top-ranked predictions to keep per sample.

    Returns:
        (labels, indices): for each sample, a list of the k class names
        and a list of the k class indices, both ordered by descending
        probability.
    """
    ans = []
    ids = []
    for p in preds:
        # argsort is ascending; reverse and slice once for the top-k
        # indices by descending probability (original sliced twice and
        # copied via a no-op comprehension).
        top = np.argsort(p)[::-1][:k]
        ids.append(list(top))
        ans.append([i2c[i] for i in top])
    return ans, ids
# Refit the classifier on the full (PCA-transformed) training set, then
# rank all four genre predictions for every test sample.
clf.fit(X_pca, y)
# NOTE(review): despite its name, `percentage_lists` holds the class
# *indices* (second return value of convert_to_labels), not percentages.
prediction_lists, percentage_lists = convert_to_labels(clf.predict_proba(X_test_pca), index2classname, k=4)
genres = ["classical", "electronic", "pop", "rock"]
# # Write to outputs
# One row per test file: true label plus the ranked predictions 1..4.
subm = pd.DataFrame(index=test.index)
subm['label'] = test.label.values
subm['pred1'] = [prediction_list[0] for prediction_list in prediction_lists]
subm['pred2'] = [prediction_list[1] for prediction_list in prediction_lists]
subm['pred3'] = [prediction_list[2] for prediction_list in prediction_lists]
subm['pred4'] = [prediction_list[3] for prediction_list in prediction_lists]
# Per-genre prediction probabilities, one column per genre.
# NOTE(review): `proba` is not assigned in this cell — presumably the
# predict_proba output of an earlier cell; confirm its row order matches
# test.index.
proba_df = pd.DataFrame(index=test.index)
proba_df['label'] = test.label.values
proba_df[genres[0]] = proba[:,0:1]
proba_df[genres[1]] = proba[:,1:2]
proba_df[genres[2]] = proba[:,2:3]
proba_df[genres[3]] = proba[:,3:4]
# Temporarily lift the row limit so the full tables render in the notebook,
# then restore the default afterwards.
pd.set_option('display.max_rows', None)
print(subm)
# print(subm)
display(subm)
display(proba_df)
pd.reset_option('display.max_rows')
```
%% Output
%% Cell type:code id:a816521f533c6539 tags:
``` python
# Confusion matrix of the top-1 prediction: rows = actual genre,
# columns = predicted genre, rendered as an annotated heatmap.
conf_matrix = pd.DataFrame(confusion_matrix(subm['label'], subm['pred1']), columns=genres, index=genres)
plt.figure(dpi=200)
display(sns.heatmap(conf_matrix, annot=True).set( xlabel="Prediction", ylabel="Actual"))
```
%% Cell type:code id:d2d7e5ef892ec807 tags:
``` python
# Top-2 accuracy: a sample counts as correct if the true label appears in
# either of the first two predictions.
subm_top_2 = subm.copy()
# Credit pred2 when it matches the label, otherwise fall back to pred1.
subm_top_2["top_2"] = subm.apply(lambda row: row.get("pred2") if row.get("label") == row.get("pred2") else row.get("pred1"), axis=1)
conf_matrix_top_2 = pd.DataFrame(confusion_matrix(subm['label'], subm_top_2["top_2"]), columns=genres, index=genres)
# Accuracy = trace of the confusion matrix / total sample count.
accuracy_score_top_2 = sum(sum(conf_matrix_top_2.values * np.identity(4))) / sum(sum(conf_matrix_top_2.values))
print(f"Accuracy for top 2 predictions: {accuracy_score_top_2}")
display(sns.heatmap(conf_matrix_top_2, annot=True).set( xlabel="Prediction", ylabel="Actual"))
```
%% Cell type:code id:4433589d09bda6e5 tags:
``` python
display(sns.heatmap(proba_df.corr(numeric_only=True), vmin=-1, vmax=1, annot=True).set(title="Correlation heatmap of prediction probabilities"))
```
%% Cell type:markdown id:209e3007ae290ede tags:
label pred1 pred2 pred3 pred4
filename
classical_12.mp3 classical classical electronic pop rock
classical_2.mp3 classical classical electronic pop rock
classical_20.mp3 classical classical rock pop electronic
classical_27.mp3 classical classical pop electronic rock
classical_39.mp3 classical classical electronic rock pop
classical_4.mp3 classical classical electronic pop rock
classical_40.mp3 classical electronic rock classical pop
classical_46.mp3 classical classical electronic pop rock
classical_47.mp3 classical classical electronic pop rock
classical_48.mp3 classical classical pop electronic rock
classical_49.mp3 classical classical pop electronic rock
classical_52.mp3 classical electronic pop rock classical
classical_54.mp3 classical classical electronic pop rock
classical_6.mp3 classical classical electronic pop rock
classical_62.mp3 classical classical pop rock electronic
classical_67.mp3 classical classical electronic pop rock
classical_69.mp3 classical classical pop rock electronic
classical_82.mp3 classical classical electronic pop rock
classical_9.mp3 classical classical electronic pop rock
classical_92.mp3 classical classical pop rock electronic
classical_94.mp3 classical classical electronic pop rock
electronic_11.mp3 electronic electronic rock pop classical
electronic_20.mp3 electronic electronic pop rock classical
electronic_21.mp3 electronic electronic pop rock classical
electronic_3.mp3 electronic electronic pop classical rock
electronic_35.mp3 electronic electronic rock pop classical
electronic_36.mp3 electronic pop rock electronic classical
electronic_38.mp3 electronic electronic pop classical rock
electronic_44.mp3 electronic electronic rock pop classical
electronic_49.mp3 electronic pop rock electronic classical
electronic_55.mp3 electronic electronic rock pop classical
electronic_59.mp3 electronic electronic rock pop classical
electronic_61.mp3 electronic electronic rock pop classical
electronic_62.mp3 electronic electronic pop rock classical
electronic_63.mp3 electronic pop electronic rock classical
electronic_81.mp3 electronic electronic rock pop classical
pop_1.mp3 pop pop rock electronic classical
pop_10.mp3 pop pop rock classical electronic
pop_100.mp3 pop rock pop electronic classical
pop_25.mp3 pop pop classical electronic rock
pop_32.mp3 pop pop rock electronic classical
pop_38.mp3 pop rock pop electronic classical
pop_39.mp3 pop electronic rock pop classical
pop_50.mp3 pop pop rock classical electronic
pop_53.mp3 pop pop rock electronic classical
pop_58.mp3 pop rock pop electronic classical
pop_61.mp3 pop pop rock classical electronic
pop_62.mp3 pop pop rock electronic classical
pop_64.mp3 pop rock pop electronic classical
pop_65.mp3 pop pop rock electronic classical
pop_70.mp3 pop electronic pop classical rock
pop_79.mp3 pop pop rock classical electronic
pop_80.mp3 pop pop rock electronic classical
pop_82.mp3 pop pop rock electronic classical
pop_85.mp3 pop pop rock electronic classical
pop_91.mp3 pop rock pop electronic classical
pop_98.mp3 pop pop rock electronic classical
rock_18.mp3 rock rock pop electronic classical
rock_2.mp3 rock rock pop electronic classical
rock_23.mp3 rock rock pop electronic classical
rock_32.mp3 rock rock pop electronic classical
rock_45.mp3 rock rock pop electronic classical
rock_46.mp3 rock rock pop electronic classical
rock_48.mp3 rock pop rock electronic classical
rock_51.mp3 rock rock pop electronic classical
rock_52.mp3 rock rock pop electronic classical
rock_57.mp3 rock rock pop electronic classical
rock_6.mp3 rock rock pop electronic classical
rock_62.mp3 rock rock pop electronic classical
rock_63.mp3 rock pop rock electronic classical
rock_66.mp3 rock pop rock electronic classical
rock_73.mp3 rock electronic pop rock classical
rock_75.mp3 rock rock pop electronic classical
rock_78.mp3 rock rock pop electronic classical
rock_80.mp3 rock rock pop electronic classical
rock_85.mp3 rock rock pop electronic classical
rock_86.mp3 rock rock pop electronic classical
rock_88.mp3 rock rock pop electronic classical
rock_92.mp3 rock rock pop electronic classical
rock_93.mp3 rock pop rock electronic classical
### Interpretation of results:
The confusion matrix shows the true labels on the y-axis, the predicted values on the x-axis.
Classical music was predicted well, with only 1 sample misclassified as electronic.
Pop has the most misclassifications, with a true positive rate of 44.44%, due to wrong classifications as electronic (4) and rock (6).
A high correlation between rock and pop can also be seen in the correlation plot between prediction probabilities.
%% Cell type:code id:bbd99cb8 tags:
``` python
# test pickle saving & loading
# svc_path = BASE_PATH / "out" / "SVC"/ "clf.pickle"
# svc_path.parent.mkdir(parents=True, exist_ok=True)
#
# with open(svc_path, "wb") as file:
# pickle.dump(clf, file)
#
# with open(svc_path, "rb") as file:
# loaded = pickle.load(file)
# loaded.predict_proba(X_test_pca)
```
%% Cell type:code id:4a32007a tags:
``` python
# output
# Persist the fitted classifier (pickle) and the prediction table (CSV)
# to the configured output paths, creating parent directories first.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
Path(OUTPUT_PATHS["clf"]).resolve().parent.mkdir(parents=True, exist_ok=True)
Path(OUTPUT_PATHS["submission"]).resolve().parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATHS["clf"], "wb") as file:
    pickle.dump(clf, file)
subm.to_csv(OUTPUT_PATHS["submission"], index=False)
```
%% Cell type:code id:99782035 tags:
``` python
# def get_result() -> pd.DataFrame:
# """ Return the produced artefact of this notebook """
# return result
```
......
source diff could not be displayed: it is too large. Options to address this: view the blob.
source diff could not be displayed: it is too large. Options to address this: view the blob.
......@@ -123,13 +123,13 @@ files = [
[[package]]
name = "anyio"
version = "4.2.0"
version = "4.3.0"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
optional = false
python-versions = ">=3.8"
files = [
{file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"},
{file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"},
{file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
{file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
]
[package.dependencies]
......@@ -2149,13 +2149,13 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=
[[package]]
name = "nbconvert"
version = "7.16.0"
description = "Converting Jupyter Notebooks"
version = "7.16.1"
description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
optional = false
python-versions = ">=3.8"
files = [
{file = "nbconvert-7.16.0-py3-none-any.whl", hash = "sha256:ad3dc865ea6e2768d31b7eb6c7ab3be014927216a5ece3ef276748dd809054c7"},
{file = "nbconvert-7.16.0.tar.gz", hash = "sha256:813e6553796362489ae572e39ba1bff978536192fb518e10826b0e8cadf03ec8"},
{file = "nbconvert-7.16.1-py3-none-any.whl", hash = "sha256:3188727dffadfdc9c6a1c7250729063d7bc78b355ad7aa023138afa030d1cd07"},
{file = "nbconvert-7.16.1.tar.gz", hash = "sha256:e79e6a074f49ba3ed29428ed86487bf51509d9aab613bd8522ac08f6d28fd7fd"},
]
[package.dependencies]
......@@ -2917,19 +2917,23 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pydantic-settings"
version = "2.1.0"
version = "2.2.0"
description = "Settings management using Pydantic"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydantic_settings-2.1.0-py3-none-any.whl", hash = "sha256:7621c0cb5d90d1140d2f0ef557bdf03573aac7035948109adf2574770b77605a"},
{file = "pydantic_settings-2.1.0.tar.gz", hash = "sha256:26b1492e0a24755626ac5e6d715e9077ab7ad4fb5f19a8b7ed7011d52f36141c"},
{file = "pydantic_settings-2.2.0-py3-none-any.whl", hash = "sha256:5f7bcaf9ad4419559dc5ac155c0324a9aeb2547c60471ee7c7d026f467a6b515"},
{file = "pydantic_settings-2.2.0.tar.gz", hash = "sha256:648d0a76673e69c51278979cba2e83cf16a23d57519bfd7e553d1c3f37db5560"},
]
[package.dependencies]
pydantic = ">=2.3.0"
python-dotenv = ">=0.21.0"
[package.extras]
toml = ["tomlkit (>=0.12)"]
yaml = ["pyyaml (>=6.0.1)"]
[[package]]
name = "pygments"
version = "2.17.2"
......@@ -3961,13 +3965,13 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake
[[package]]
name = "urllib3"
version = "2.2.0"
version = "2.2.1"
description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false
python-versions = ">=3.8"
files = [
{file = "urllib3-2.2.0-py3-none-any.whl", hash = "sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224"},
{file = "urllib3-2.2.0.tar.gz", hash = "sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20"},
{file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
{file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
]
[package.extras]
......
access:
files: public
files: restricted
record: public
files:
default_preview: null
......@@ -24,13 +24,13 @@ metadata:
publisher: TU Wien
related_identifiers:
- identifier: https://www2.projects.science.uu.nl/memotion/emotifydata/
relation:
relation_type:
id: isderivedfrom
resource_type:
id: sound
scheme: url
- identifier: https://gitlab.tuwien.ac.at/martin.weise/fairnb
relation:
relation_type:
id: isderivedfrom
resource_type:
id: software
......