
Commit 7bcf150e authored by Mahler, Lukas

Merge branch 'dev' into 'master'

improve fairnb and rework notebooks

See merge request !3
parents 42e42e52 f4c1da63
Showing 4177 additions and 3535 deletions
......@@ -6,3 +6,4 @@ dist
.idea
.mypy_cache
.pytest_cache
!config/example-*
\ No newline at end of file
......@@ -36,7 +36,7 @@ Lukas Mahler
## License
## References
Docker Image on invenio: https://researchdata.tuwien.ac.at/
DBRepo: https://dbrepo1.tuwien.ac.at/
Invenio: https://researchdata.tuwien.ac.at/
Docker Image on TUWRD: https://researchdata.tuwien.ac.at/ \
DBRepo: https://dbrepo1.tuwien.ac.at/ \
Code backup on TUWRD: https://researchdata.tuwien.ac.at/ \
Thesis: https://repositum.tuwien.ac.at/
host: https://dbrepo1.ec.tuwien.ac.at
container-id: <insert id>
database-id: <insert id>
database-id: <insert database id>
credentials:
username: <insert username from dbrepo>
password: <insert password from dbrepo>
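For orientation, a minimal sketch of how a config and credentials file in this shape could be fed to `DBRepoConnector.from_config` (the file names and the use of PyYAML are assumptions, not taken from the repository):

```python
# Minimal sketch (assumed file names; PyYAML assumed available). The keys must
# match what from_config() reads: config -> host, database-id;
# credentials -> username, password, client_secret_key.
import yaml

from fairnb.api.dbrepo import DBRepoConnector

with open("config/dbrepo-config.yml") as f:
    config = yaml.safe_load(f)
with open("config/dbrepo-credentials.yml") as f:
    credentials = yaml.safe_load(f)

connector = DBRepoConnector.from_config(config, credentials)
```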
......
import pathlib
import logging
from tusclient import client
from datetime import datetime
from datetime import datetime, timedelta
from functools import wraps
from typing import Any, Callable
import requests
......@@ -17,15 +17,16 @@ CHUNK_SIZE = 1024 * 1024 * 100
def re_auth(func: Callable) -> Callable:
@wraps(func)
def inner(self, *args, **kwargs):
assert self.get_token_age()
age_seconds = (datetime.now() - self.get_token_age()).seconds
if 60 * 10 < age_seconds < 60 * 25:
LOG.warning(f"Re-authenticating due to almost expired token")
self.refresh_token_keycloak()
if age_seconds >= 60*25:
LOG.warning(f"Re-login due to expired token")
self.authenticate_keycloak()
assert self.get_token_expiry()
seconds_token_expired = (self.get_token_expiry() - datetime.now()).total_seconds()
seconds_refresh_expired = (self.get_refresh_expiry() - datetime.now()).total_seconds()
if seconds_token_expired < 60:
if seconds_refresh_expired > 60:
LOG.warning(f"Re-authenticating due to (almost) expired token")
self.refresh_token_keycloak()
else:
LOG.warning(f"Re-authenticating due to (almost) expired refresh token")
self.authenticate_keycloak()
return func(self, *args, **kwargs)
return inner
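Read on its own, the reworked decorator implements a simple two-tier policy: refresh when the access token is (almost) expired and the refresh token is still usable, otherwise log in again. A self-contained sketch of that pattern (illustrative class and method names, not the project's actual API):

```python
# Standalone sketch of the expiry-based re-auth pattern shown above;
# the class and method names are illustrative, not the project's API.
from datetime import datetime, timedelta
from functools import wraps


def re_auth(func):
    @wraps(func)
    def inner(self, *args, **kwargs):
        now = datetime.now()
        token_left = (self.token_expiry - now).total_seconds()
        refresh_left = (self.refresh_expiry - now).total_seconds()
        if token_left < 60:          # access token (almost) expired
            if refresh_left > 60:    # refresh token still usable
                self.refresh_token()
            else:                    # both expired: log in again
                self.authenticate()
        return func(self, *args, **kwargs)
    return inner


class DummyClient:
    """Illustrative stand-in for a connector holding token state."""

    def __init__(self):
        self.token_expiry = datetime.now() + timedelta(minutes=15)
        self.refresh_expiry = datetime.now() + timedelta(minutes=30)

    def refresh_token(self):
        self.token_expiry = datetime.now() + timedelta(minutes=15)

    def authenticate(self):
        self.__init__()

    @re_auth
    def query(self):
        return "ok"
```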
......@@ -39,17 +40,18 @@ class DBRepoConnector:
password: str,
client_secret_key: str,
host: str,
container_id: str,
# container_id: str,
database_id: str):
self.__token = None
self.__token_age: datetime = None
self.__token_expiry: datetime = None
self.__refresh_expiry: datetime = None
self.__refresh_token: str = None
self.headers = None
self.host = host
self.__username = username
self.__password = password
self.__client_secret_key = client_secret_key
self.container_id = container_id
# self.container_id = container_id
self.database_id = database_id
self.__keycloak_openid = KeycloakOpenID(
server_url=f"{host}/api/auth/",
......@@ -61,11 +63,14 @@ class DBRepoConnector:
self.tusclient = client.TusClient(
f"{self.host}/api/upload/files/",
# headers=self.headers
headers={'Content-Type': 'application/offset+octet-stream'}
# headers={'Content-Type': 'application/offset+octet-stream'}
)
def get_token_age(self) -> datetime:
return self.__token_age
def get_token_expiry(self) -> datetime:
return self.__token_expiry
def get_refresh_expiry(self) -> datetime:
return self.__refresh_expiry
@classmethod
def from_config(cls, config: dict, credentials: dict):
......@@ -74,7 +79,7 @@ class DBRepoConnector:
credentials["password"],
credentials["client_secret_key"],
config["host"],
config["container-id"],
# config["container-id"],
config["database-id"]
)
......@@ -110,19 +115,21 @@ class DBRepoConnector:
self.__token = token["access_token"]
self.__refresh_token = token["refresh_token"]
self.__token_age = datetime.now()
self.__token_expiry = datetime.now() + timedelta(seconds=token["expires_in"])
self.__refresh_expiry = datetime.now() + timedelta(seconds=token["refresh_expires_in"])
self.headers = {"Authorization": f"Bearer {self.__token}"}
return token
def refresh_token_keycloak(self, token: str = None):
token = self.__token if token is None else token
# token = self.__token if token is None else token
token = self.__keycloak_openid.refresh_token(self.__refresh_token)
self.__token = token["access_token"]
self.__refresh_token = token["refresh_token"]
self.__token_expiry = datetime.now() + timedelta(seconds=token["expires_in"])
self.__refresh_expiry = datetime.now() + timedelta(seconds=token["refresh_expires_in"])
self.headers = {"Authorization": f"Bearer {self.__token}"}
self.__token_age = datetime.now()
return token
@staticmethod
......@@ -249,26 +256,28 @@ class DBRepoConnector:
)
upload_url = uploader.create_url()
uploader.set_url(upload_url.replace('http', 'https')) # FIX: wrong location response
upload_url = upload_url.replace('http', 'https')
uploader.set_url(upload_url) # FIX: wrong location response
uploader.upload()
response_upload_import = requests.post(
f"{self.host}/api/database/{self.database_id}/table/{table_id}/data/import",
json={
"false_element": None,
"location": f"/tmp/{upload_url.split('/')[-1]}",
"false_element": "False",
"location": f"{upload_url.split('/')[-1].split('+')[0]}",
"null_element": None,
"quote": '"',
"separator": ",",
"skip_lines": 1,
"true_element": None
"true_element": "True"
},
headers=self.headers
)
LOG.debug(response_upload_import)
LOG.debug(f"Uploaded dataframe using tui: {response_upload_import}")
if not response_upload_import.ok:
LOG.warning(f"Move for table {table_id} failed: {response_upload_import}")
raise Exception(f"Move for table {table_id} failed: {response_upload_import}")
@re_auth
def delete_all_data(self, table_id: str):
......
from datetime import datetime
import pandas as pd
from dataclasses import dataclass, field
from pathlib import Path
......@@ -11,11 +13,12 @@ class DbRepoEntity(Entity):
table_name: str = field(init=True, default=None)
table_description: str = field(init=True, default="")
table_id: int = field(init=False, default=None)
repository: str = field(init=False, default="https://dbrepo1.ec.tuwien.ac.at/")
def __post_init__(self):
super().__post_init__()
if self.metadata is not None: # equivalent to: self.id is not None
if self.metadata is not None: # equivalent to: self.id is not None
self.table_id = int(self.metadata.uri.split("/")[-1])
else:
assert self.table_name is not None # has to exist for the ability to get table_id
......@@ -51,9 +54,15 @@ class DbRepoEntity(Entity):
df = self.dbrepo_connector.download_table_as_df(str(self.table_id))
df = df[df['entity_id'] == self.id] # save only entity, not whole table
df.to_csv(self.location)
df = df.drop(columns=["entity_id", "id"])
# create dir if not exists
self.location.resolve().parent.mkdir(parents=True, exist_ok=True)
df.to_csv(self.location, index=False)
def upload(self, executed_file: Path, dependencies: list[Entity] = None):
def upload(self, executed_file: Path, dependencies: list[Entity] = None,
start_time: datetime = datetime.now(),
end_time: datetime = datetime.now()):
df = pd.read_csv(self.location)
# add id column to df:
......@@ -71,10 +80,12 @@ class DbRepoEntity(Entity):
self.name,
self.description,
executed_file=executed_file,
uri=f"{self.dbrepo_connector.host}/api/database/"
uri=f"{self.dbrepo_connector.host}/database/"
f"{self.dbrepo_connector.database_id}/table/{self.table_id}",
type=self.type,
platform="dbrepo",
platform=self.repository,
started_at=start_time,
ended_at=end_time
)
self.upload_provenance(metadata)
......@@ -91,5 +102,4 @@ class DbRepoEntity(Entity):
assert self.id is not None
assert self.table_id is not None
df["id"] = self.id # add entity id to df
self.dbrepo_connector.upload_data(df, str(self.table_id))
......@@ -2,6 +2,7 @@ import copy
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
import pandas as pd
......@@ -10,8 +11,8 @@ from fairnb.api.dbrepo import DBRepoConnector
from fairnb.entity.entity_provenance import EntityProvenance
PROVENANCE_TABLE_NAME = "entity_provenance"
DEPENDENCY_TABLE_NAME = "entity_dependencies"
PROVENANCE_TABLE_NAME = "entity_provenance_test3"
DEPENDENCY_TABLE_NAME = "entity_dependencies_test3"
LOG = logging.getLogger(__name__)
# TODO: Upload Datetime objects as Timestamps instead of str
......@@ -69,7 +70,7 @@ class Entity(ABC):
raise NotImplementedError
@abstractmethod
def upload(self, executed_file: Path, dependencies=None):
def upload(self, executed_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
"""Upload this Entity"""
raise NotImplementedError
......@@ -125,7 +126,7 @@ class Entity(ABC):
# FIXME: create robust version of id retrieval, if possible
row = df.iloc[df["id"].idxmax()] # get the newest row, as it should contain the correct data
meta = EntityProvenance.from_series(row)
assert meta.creation_time == provenance.creation_time and meta.name == provenance.name
assert meta.started_at == provenance.started_at and meta.name == provenance.name
self.id = meta.id
self.metadata = meta
......@@ -139,7 +140,7 @@ class Entity(ABC):
df = pd.DataFrame(
{
"entity_id": pd.Series(dtype="int"),
"depends_on": pd.Series(dtype="int"),
"was_derived_from": pd.Series(dtype="int"),
}
)
......@@ -150,7 +151,7 @@ class Entity(ABC):
df = pd.concat([
df,
pd.DataFrame([{"entity_id": self.id,
"depends_on": dependency.id}])
"was_derived_from": dependency.id}])
])
else:
LOG.warning("Dependency has no id, skipping dependency upload")
......@@ -161,17 +162,17 @@ class Entity(ABC):
df = provenance.to_frame().drop("id", axis=1)
return self.dbrepo_connector.create_table_if_not_exists(
df, PROVENANCE_TABLE_NAME, "A table containing Provence information on all persisted Entities."
df, PROVENANCE_TABLE_NAME, "Provence information on persisted Entities created by FAIRnb."
)
def create_dependency_table_if_not_exists(self):
df = pd.DataFrame(
{
"entity_id": pd.Series(dtype="int"),
"depends_on": pd.Series(dtype="int"),
"was_derived_from": pd.Series(dtype="int"),
}
)
return self.dbrepo_connector.create_table_if_not_exists(
df, DEPENDENCY_TABLE_NAME, "Entity dependencies on other entities"
df, DEPENDENCY_TABLE_NAME, "Entity dependencies, tracking the lineage of entities, according to wasDerivedFrom relation of PROV-O."
)
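To make the lineage model concrete, this is roughly what rows in the dependency table look like under the schema above, expressing PROV-O wasDerivedFrom links between entity ids (the ids here are made up):

```python
# Sketch: rows of the dependency table defined above, expressing PROV-O
# wasDerivedFrom links between entity ids (ids are illustrative).
import pandas as pd

dependencies = pd.DataFrame(
    {
        "entity_id": pd.Series([42, 42], dtype="int"),         # derived entity
        "was_derived_from": pd.Series([17, 23], dtype="int"),  # its two inputs
    }
)
# entity 42 was derived from entities 17 and 23
```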
......@@ -23,7 +23,8 @@ class EntityProvenance:
branch: str # the branch of the repository, makes manual search of commit easier
repo_uri: str # the uri of the repository, used to locate the repository
executed_file: str # path to notebook which was executed to create the entity
creation_time: datetime # timestamp of creation time of entity
started_at: datetime # start time of execution where entity was created
ended_at: datetime # end time of execution where entity was created
platform: str # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
@classmethod
......@@ -35,6 +36,8 @@ class EntityProvenance:
type: str,
uri: str,
platform: str,
started_at: datetime,
ended_at: datetime
):
repo = git.Repo(BASE_PATH)
......@@ -42,14 +45,10 @@ class EntityProvenance:
branch = git_branch.name
commit = git_branch.repo.commit().hexsha
# TODO: Better way to point to repo instead of ssh / https link
# --> more general approach independent of authentication
repo_uri = git_branch.repo.remote().url
if repo_uri.startswith("ssh://"):
repo_uri = re.sub(":\d+/", "/", f"https://{repo_uri.split('@', 1)[1]}")
creation_time = datetime.now()
executed_file_rel = executed_file.resolve().relative_to(BASE_PATH)
return cls(
......@@ -59,7 +58,8 @@ class EntityProvenance:
uri=uri,
commit=commit,
repo_uri=repo_uri,
creation_time=creation_time,
started_at=started_at,
ended_at=ended_at,
branch=branch,
executed_file=executed_file_rel.as_posix(),
type=type,
......@@ -74,14 +74,17 @@ class EntityProvenance:
description=df["description"],
uri=df["uri"],
commit=df["commit"],
repo_uri=df["repo_uri"],
repo_uri=df["git_uri"],
executed_file=df["executed_file"],
creation_time=datetime.strptime(
df["creation_time"], "%Y-%m-%d %H:%M:%S.%f"
started_at=datetime.strptime(
df["started_at"], "%Y-%m-%d %H:%M:%S.%f"
), # TODO: replace with '%F %T'
ended_at=datetime.strptime(
df["ended_at"], "%Y-%m-%d %H:%M:%S.%f"
),
branch=df["branch"],
type=df["type"],
platform=df["platform"],
platform=df["repository"],
)
def to_frame(self):
......@@ -92,11 +95,12 @@ class EntityProvenance:
"description": pd.Series(self.description, dtype=str),
"uri": pd.Series(self.uri, dtype=str),
"commit": pd.Series(self.commit, dtype=str),
"repo_uri": pd.Series(self.repo_uri, dtype=str),
"git_uri": pd.Series(self.repo_uri, dtype=str),
"executed_file": pd.Series(self.executed_file, dtype=str),
"creation_time": pd.Series(self.creation_time, dtype=str),
"started_at": pd.Series(self.started_at, dtype=str),
"ended_at": pd.Series(self.ended_at, dtype=str),
"branch": pd.Series(self.branch, dtype=str),
"type": pd.Series(self.type, dtype=str),
"platform": pd.Series(self.platform, dtype=str),
"repository": pd.Series(self.platform, dtype=str),
}
)
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from fairnb.api.dbrepo import DBRepoConnector
......@@ -12,6 +13,7 @@ class InvenioEntity(Entity):
invenio_manager: InvenioManager = field(init=True, default=None)
record_metadata: dict = field(init=True, default=None)
publish_record: bool = field(init=True, default=False)
platform: str = field(init=False, default="https://doi.org/10.17616/R31NJMYD")
@classmethod
def new(
......@@ -60,7 +62,7 @@ class InvenioEntity(Entity):
self.invenio_manager.record_id = self.metadata.uri.split('/')[-1]
def upload(self, executed_file: Path, dependencies: list[Entity] = None):
def upload(self, executed_file: Path, dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
dir_path: Path
regex: str
......@@ -89,9 +91,11 @@ class InvenioEntity(Entity):
name=self.name,
description=self.description,
executed_file=executed_file,
uri=uri,
uri=uri.replace('/api', ''),
type=self.type,
platform="invenio",
platform=self.platform,
started_at=started_at,
ended_at=ended_at,
)
self.upload_provenance(metadata)
......
from datetime import datetime
import papermill
from nbconvert.preprocessors import ExecutePreprocessor
from fairnb.entity.entity import Entity
from fairnb.nb_config import NbConfig
class Executor:
@staticmethod
def download_dependencies(nb_config: NbConfig, require_download: bool = False):
""" Set up the dependencies to allow for later execution """
"""Set up the dependencies to allow for later execution"""
# download dependencies if not already present
[entity.download() for entity in nb_config.dependencies
if (not entity.exists_locally()) or require_download]
[
entity.download()
for entity in nb_config.dependencies
if (not entity.exists_locally()) or require_download
]
@classmethod
def execute(cls, nb_config: NbConfig, require_download: bool = False, only_local: bool = False, **kwargs):
""" Execute the notebook specified in nb_config by providing nb_config.dependencies
def execute(
cls,
nb_config: NbConfig,
require_download: bool = False,
only_local: bool = False,
**kwargs
):
"""Execute the notebook specified in nb_config by providing nb_config.dependencies
and upload the generated Entities if only_local is False.
"""
......@@ -25,7 +36,12 @@ class Executor:
if not only_local:
cls.download_dependencies(nb_config, require_download)
started_at = datetime.now()
cls.execute_notebook(nb_config)
ended_at = datetime.now()
nb_config.started_at = started_at
nb_config.ended_at = ended_at
if not only_local:
cls.upload_entities(nb_config)
......@@ -39,9 +55,15 @@ class Executor:
nb_config.nb_location.resolve(),
nb_config.nb_location.resolve(),
parameters=dict(
INPUT_PATHS={entity.type: entity.location.__str__() for entity in nb_config.dependencies},
OUTPUT_PATHS={entity.type: entity.location.__str__() for entity in nb_config.entities}
)
INPUT_PATHS={
entity.type: entity.location.__str__()
for entity in nb_config.dependencies
},
OUTPUT_PATHS={
entity.type: entity.location.__str__()
for entity in nb_config.entities
},
),
)
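For readers unfamiliar with papermill: it re-executes the notebook and overrides the cell tagged `parameters` by injecting a cell tagged `injected-parameters` (visible in the notebook diffs further below). A minimal standalone call, with placeholder paths, looks roughly like this:

```python
# Minimal papermill sketch (placeholder notebook paths); mirrors how the
# Executor above injects INPUT_PATHS/OUTPUT_PATHS into the parameters cell.
import papermill

papermill.execute_notebook(
    "notebooks/3_aggregate_features.ipynb",   # hypothetical input notebook
    "tmp/3_aggregate_features.out.ipynb",     # hypothetical output notebook
    parameters={
        "INPUT_PATHS": {"raw_features": "tmp/raw_features.csv"},
        "OUTPUT_PATHS": {"aggregated_features": "tmp/features.csv"},
    },
)
```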
@staticmethod
......@@ -49,36 +71,9 @@ class Executor:
# load generated entity and upload it
for entity in nb_config.entities:
# use inspect to get path of caller
entity.upload(nb_config.nb_location, nb_config.dependencies)
def reproduce_entity(self, entity: Entity):
pass
# TODO: additional functionality of executor class
# class ExperimentReproducer:
# def __init__(self):
# pass
# # self.config = self.configure()
#
# def entity(self, creation_func, *args, input_entity: [str] = None, **kwargs):
# """ Saves the created entity which is returned by the creation_func as a DRO """
#
# if input_entity is not None:
# # TODO: collect input entities
# collected_entities = []
# for entity in input_entities:
# collected_entities.append(self.collect_entity(entity))
#
# result = creation_func(args, input_entities, kwargs)
#
# return result
#
# def db_repo_entity(self, function: Callable[..., pd.DataFrame], *args, **kwargs):
# """ Saves the created dataframe to DBRepo while citing the inputs. """
# df = function(args, kwargs)
#
# # TODO: upload code to DBRepo
#
# def recreate_entity(self, uri: str):
# """ Checkout correct commit, download required artefacts and execute correct artefact code. """
# pass
entity.upload(
nb_config.nb_location,
nb_config.dependencies,
nb_config.started_at,
nb_config.ended_at
)
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from fairnb.entity.entity import Entity
......@@ -10,6 +11,8 @@ class NbConfig:
entities: list[Entity]
dependencies: list[Entity]
nb_output_location: Path = field(init=True, default=None)
started_at: datetime = field(init=True, default=datetime.now())
ended_at: datetime = field(init=True, default=datetime.now())
def __post_init__(self):
if not self.nb_output_location:
......
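One general Python caveat about the new `started_at`/`ended_at` fields in `NbConfig` above (not something this diff addresses): `field(default=datetime.now())` is evaluated once when the dataclass is defined, so every instance shares that timestamp unless the caller overrides it, as the Executor does. Using `default_factory` evaluates per instance; a minimal sketch:

```python
# Sketch of the dataclass-default caveat: default= is evaluated once at class
# definition time, default_factory= is called for every new instance.
from dataclasses import dataclass, field
from datetime import datetime


@dataclass
class TimedConfig:
    created_eager: datetime = field(default=datetime.now())       # shared timestamp
    created_lazy: datetime = field(default_factory=datetime.now)  # fresh per instance
```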
%% Cell type:markdown id:4389a8092677254e tags:
# Audio Files
Bundle the provided audio files (400 MP3s) in a tar archive, compress it using gzip, and store it in the output folder.
%% Cell type:code id:87ab37c6 tags:
``` python
from definitions import BASE_PATH
import tarfile
import zipfile
import os
from pathlib import Path
```
%% Cell type:code id:1b4e6b01 tags:parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": str(BASE_PATH / "tmp/1_audio_files/output/emotifymusic.tar.gz")
}
```
%% Cell type:code id:a0c3731f tags:injected-parameters
%% Cell type:code id:15dea136 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/1_audio_files/output/emotifymusic.tar.gz"
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/1_audio_files/output/emotifymusic.tar.gz"
}
```
%% Cell type:code id:1e487573 tags:
``` python
# load provided files
zip_path = BASE_PATH / "resource" / "1_audio_files" / "emotifymusic.zip"
dir_path = BASE_PATH / "tmp" / "1_audio_files" / "music"
dir_path.mkdir(parents=True, exist_ok=True)
# unzip to dir_path
with zipfile.ZipFile(zip_path, "r") as zfile:
zfile.extractall(path=dir_path)
```
%% Cell type:code id:c3193f35 tags:
``` python
file_paths = list(dir_path.rglob('**/*.*'))
flattened_dir_path = BASE_PATH / "tmp" / "1_audio_files" / "flattened"
flattened_dir_path.mkdir(parents=True, exist_ok=True)
for path in file_paths:
(flattened_dir_path / path.relative_to(dir_path).as_posix().replace('/', '_')).write_bytes(path.read_bytes())
```
%% Cell type:code id:3272ea2b tags:
``` python
tar_path = Path(OUTPUT_PATHS["audio_tar"])
tar_path.parent.mkdir(parents=True, exist_ok=True)
with tarfile.open(tar_path, "w:gz") as file:
file.add(flattened_dir_path, arcname=os.path.sep)
```
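As an optional sanity check (not part of the original notebook), the archive members can be listed to confirm the flattened files made it into the gzip-compressed tar:

```python
# Optional sanity check (not in the original notebook): list archive members
# to confirm the flattened MP3 files were written into the tarball.
with tarfile.open(tar_path, "r:gz") as tar:
    members = [m.name for m in tar.getmembers() if m.isfile()]
print(f"{len(members)} files in {tar_path.name}")
```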
......
%% Cell type:markdown id:f48a4573 tags:
# Aggregate MFCC Features
Aggregate from n rows per file to 1 (calculate min, max, etc. for each feature).
%% Cell type:code id:389576b8 tags:
``` python
from pathlib import Path
import pandas as pd
from definitions import BASE_PATH
```
%% Cell type:code id:26f640e0 tags:parameters
``` python
INPUT_PATH = BASE_PATH / "tmp" / "3_aggregate_features" / "input"
OUTPUT_PATH = BASE_PATH / "tmp" / "3_aggregate_features" / "output"
INPUT_PATHS: dict[str, str] = {
"raw_features": (INPUT_PATH / "raw_features.csv").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"features": (OUTPUT_PATH / "features.csv").__str__()
}
```
%% Cell type:code id:70fd8bf2 tags:injected-parameters
%% Cell type:code id:88ecee07 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"raw_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/input/raw_features.csv"
"raw_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/3_aggregate_features/input/raw_features.csv"
}
OUTPUT_PATHS = {
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/output/features.csv"
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/3_aggregate_features/output/features.csv"
}
```
%% Cell type:code id:c5d9d980 tags:
``` python
# inputs
raw_features = pd.read_csv(INPUT_PATHS["raw_features"], index_col=False)
```
%% Cell type:code id:99f75f47 tags:
``` python
meta_columns = ["sample", "filename", "label"]
mfcc_aggregated = raw_features\
.drop(meta_columns, axis=1, errors='ignore')\
.groupby(raw_features.filename).agg(['min', 'max', 'mean', 'std', 'skew'])
mfcc_meta = pd.DataFrame(raw_features['label'].groupby(raw_features.filename).last())
mfcc_meta.columns = pd.MultiIndex.from_arrays([['label'], ['']]) # needed for merge
mfcc_merged = pd.merge(mfcc_meta, mfcc_aggregated, left_index=True, right_index=True)
# reduce multi index to single index
one_level_cols = ['_'.join([str(el) for el in col]) for col in mfcc_merged.columns[1:]]
one_level_cols.insert(0, "label")
mfcc_merged.columns = pd.Index(one_level_cols)
mfcc_merged = mfcc_merged.reset_index()
mfcc_merged
```
%% Output
filename label 0_min 0_max 0_mean \
0 classical_1.mp3 classical -530.78436 -163.308350 -302.203167
1 classical_10.mp3 classical -562.85785 -96.164795 -219.259016
2 classical_100.mp3 classical -536.23737 -61.608826 -177.804114
3 classical_11.mp3 classical -536.45746 -120.429665 -222.126303
4 classical_12.mp3 classical -562.67523 -148.133560 -270.975406
.. ... ... ... ... ...
395 rock_95.mp3 rock -553.11010 -5.218835 -193.506047
396 rock_96.mp3 rock -541.23600 27.163332 -119.113996
396 rock_96.mp3 rock -541.23600 27.163334 -119.113996
397 rock_97.mp3 rock -518.49500 58.526745 -66.267744
398 rock_98.mp3 rock -518.64307 53.555115 -45.734517
399 rock_99.mp3 rock -544.70310 75.612130 -49.380943
0_std 0_skew 1_min 1_max 1_mean ... 38_min \
0 51.142183 -0.468374 0.000000 178.75162 111.332342 ... -44.098070
1 53.561839 -0.772320 0.029056 259.63272 215.094182 ... -27.458416
1 53.561838 -0.772320 0.029056 259.63270 215.094182 ... -27.458416
2 83.381622 -2.587179 0.000000 190.47589 112.471713 ... -27.335688
3 76.246992 -2.402418 0.000000 159.42575 99.853645 ... -31.774948
4 52.191182 -0.366586 0.000000 194.26416 148.226648 ... -44.843815
4 52.191182 -0.366586 0.000000 194.26416 148.226647 ... -44.843810
.. ... ... ... ... ... ... ...
395 76.869437 -0.201055 -89.948746 201.18045 111.724191 ... -27.043941
396 58.420684 -0.957699 -7.415959 210.49246 125.453699 ... -37.584858
396 58.420684 -0.957699 -7.415961 210.49246 125.453699 ... -37.584858
397 65.635619 -0.898026 -58.824410 175.20135 99.288265 ... -29.620445
398 52.444200 -1.705641 0.000000 187.04272 96.440874 ... -26.967852
399 54.045627 -0.863093 -32.930650 191.73538 93.971242 ... -21.929403
398 52.444200 -1.705641 0.000000 187.04274 96.440874 ... -26.967848
399 54.045627 -0.863093 -32.930653 191.73538 93.971242 ... -21.929403
38_max 38_mean 38_std 38_skew 39_min 39_max 39_mean \
0 47.308060 -3.713503 16.553984 0.230691 -46.794480 49.352516 -2.282116
1 29.811110 0.484271 8.660648 -0.479016 -28.989979 27.533707 0.952658
2 27.610388 -0.333233 8.185075 0.208425 -38.095375 31.397882 -1.494916
3 31.500881 -3.781627 9.191043 0.260886 -22.667439 50.992905 1.600777
4 28.490644 -6.242015 10.546545 0.341848 -25.040886 46.878204 1.844494
1 29.811110 0.484271 8.660648 -0.479016 -28.989983 27.533710 0.952658
2 27.610388 -0.333233 8.185075 0.208425 -38.095375 31.397880 -1.494916
3 31.500881 -3.781627 9.191043 0.260886 -22.667440 50.992897 1.600777
4 28.490644 -6.242015 10.546545 0.341848 -25.040888 46.878204 1.844494
.. ... ... ... ... ... ... ...
395 22.451445 -7.234634 8.471853 0.753855 -24.712723 23.410387 -4.502398
396 28.087940 -9.704238 8.447620 0.112760 -38.147890 21.814400 -8.249507
397 26.325895 -5.722825 7.727378 0.207489 -29.497524 25.410656 -3.356614
398 8.714736 -9.511491 5.551820 -0.025604 -23.020082 13.948639 -2.664985
399 17.050608 -5.296691 5.894962 0.390705 -20.983192 29.312021 -0.321836
396 28.087936 -9.704238 8.447620 0.112760 -38.147890 21.814402 -8.249507
397 26.325895 -5.722825 7.727378 0.207489 -29.497524 25.410654 -3.356614
398 8.714737 -9.511491 5.551820 -0.025604 -23.020084 13.948638 -2.664985
399 17.050608 -5.296691 5.894963 0.390705 -20.983192 29.312023 -0.321836
39_std 39_skew
0 15.285639 0.171462
1 10.477735 -0.185771
2 10.917299 0.020985
3 10.125545 0.595763
4 11.160392 0.503120
.. ... ...
395 6.687983 0.238807
395 6.687984 0.238807
396 7.807756 0.071968
397 8.170526 0.160330
398 5.051498 -0.258407
399 6.571660 0.384794
[400 rows x 202 columns]
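The aggregation cell above relies on the fact that `groupby().agg()` returns MultiIndex columns such as `(0, 'min')`, which are then joined into single-level names such as `0_min`. A toy illustration with made-up data:

```python
# Toy illustration (made-up data) of the column flattening used above.
import pandas as pd

toy = pd.DataFrame({"filename": ["a.mp3", "a.mp3", "b.mp3"], 0: [1.0, 3.0, 2.0]})
agg = toy.groupby("filename").agg(["min", "max"])
# MultiIndex columns like (0, 'min') are joined into names like '0_min'
agg.columns = ["_".join(str(level) for level in col) for col in agg.columns]
# -> columns: ['0_min', '0_max'], indexed by filename
```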
%% Cell type:code id:4ac5c765 tags:
``` python
# outputs
aggregated_features_path = Path(OUTPUT_PATHS["aggregated_features"]).resolve()
aggregated_features_path.parent.mkdir(parents=True, exist_ok=True)
output = mfcc_merged
output.to_csv(aggregated_features_path, index=False)
```
......
%% Cell type:markdown id:e92b4fe9 tags:
# Split the Features into Train and Test Set
%% Cell type:code id:5f1fae44 tags:
``` python
import pandas as pd
from pathlib import Path
from definitions import BASE_PATH
```
%% Cell type:code id:01de1b27 tags:parameters
``` python
# Tagged with 'parameters'
from definitions import BASE_PATH
INPUT_PATHS: dict[str, str] = {
"features": (BASE_PATH / "tmp" / "4_split" / "input" / "features.csv").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"split": (BASE_PATH / "tmp" / "4_split" / "output" / "split.csv").__str__()
}
```
%% Cell type:code id:d8169758 tags:injected-parameters
%% Cell type:code id:e99ca0ba tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/input/features.csv"
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/input/features.csv"
}
OUTPUT_PATHS = {
"split": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/output/split.csv"
"split": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/output/split.csv"
}
```
%% Cell type:code id:a4cc6800 tags:
``` python
# INPUT
for path in INPUT_PATHS.values():
assert Path(path).exists()
features = pd.read_csv(INPUT_PATHS["aggregated_features"])
```
%% Cell type:code id:a186d0c4 tags:
``` python
train = features.sample(frac=0.8).sort_index()
test = features.drop(train.index)
split_true = pd.DataFrame({
"filename": train.filename,
"train": True
})
split_false = pd.DataFrame({
"filename": test.filename,
"train": False
})
split_concat = pd.concat([split_true, split_false])\
.sort_values("filename")\
.reset_index(drop=True)
```
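Since reproducibility is the point of this pipeline, note that `sample(frac=0.8)` without a seed yields a different split on every execution; passing `random_state` (an addition on my part, not something the notebook does) would pin it:

```python
# Reproducibility sketch (not in the original notebook): fix the seed so the
# 80/20 split is identical across re-executions.
train = features.sample(frac=0.8, random_state=42).sort_index()
test = features.drop(train.index)
```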
%% Cell type:code id:091e0641 tags:
``` python
split_concat
```
%% Output
filename train
0 classical_1.mp3 False
1 classical_10.mp3 True
2 classical_100.mp3 False
3 classical_11.mp3 True
4 classical_12.mp3 True
.. ... ...
395 rock_95.mp3 True
395 rock_95.mp3 False
396 rock_96.mp3 True
397 rock_97.mp3 True
398 rock_98.mp3 True
399 rock_99.mp3 True
[400 rows x 2 columns]
%% Cell type:code id:7b11b8bb tags:
``` python
# output
OUTPUT_PATH = Path(OUTPUT_PATHS["split"])
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
output = split_concat
output.to_csv(OUTPUT_PATH, index=False)
```
......
......@@ -24,6 +24,7 @@ black = "^23.7.0"
oic = "^1.6.1"
python-keycloak = "^3.3.0"
tuspy = "^1.0.1"
seaborn = "^0.13.2"
[tool.poetry.group.dev.dependencies]
......
# dbrepo-ismir
Python packages to install: llvmlite, numba, librosa
apt packages needed: git-lfs, libsndfile1
## Getting started
To make it easy for you to get started with GitLab, here's a list of recommended next steps.
Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)!
## Add your files
- [x] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files
- [x] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command:
```
cd existing_repo
git remote add origin https://gitlab.tuwien.ac.at/martin.weise/dbrepo-ismir.git
git branch -M main
git push -uf origin main
```
## Integrate with your tools
- [x] [Set up project integrations](https://gitlab.tuwien.ac.at/martin.weise/dbrepo-ismir/-/settings/integrations)
## Collaborate with your team
- [x] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/)
- [x] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html)
- [x] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically)
- [x] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/)
- [x] [Automatically merge when pipeline succeeds](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html)
## Test and Deploy
Use the built-in continuous integration in GitLab.
- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html)
- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing(SAST)](https://docs.gitlab.com/ee/user/application_security/sast/)
- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html)
- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/)
- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html)
***
# Editing this README
When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thank you to [makeareadme.com](https://www.makeareadme.com/) for this template.
## Suggestions for a good README
Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information.
## Name
DBRepo-ISMIR
## Description
A repository demonstrating the use of the DBRepo and Invenio platforms to create a genre-prediction SVM and make its results reproducible.
## Badges
On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge.
## Visuals
Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method.
## Installation
Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
## Usage
Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
## Support
Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc.
## Roadmap
If you have ideas for releases in the future, it is a good idea to list them in the README.
## Contributing
State if you are open to contributions and what your requirements are for accepting them.
For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self.
You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser.
## Authors and acknowledgment
Show your appreciation to those who have contributed to the project.
## License
For open source projects, say how it is licensed.
## Project status
If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers.
......@@ -19,4 +19,19 @@ metadata:
publication_date: '2022-01-01'
resource_type:
id: sound
title: 'DBREPO ISMIR testing 1'
title: Flattened Emotify Dataset
description: "400 MP3 files of one minute playtime each, names are labeled with the respective genre, one of: classical, rock, pop and electronic."
publisher: TU Wien
related_identifiers:
- identifier: https://www2.projects.science.uu.nl/memotion/emotifydata/
relation:
id: isderivedfrom
resource_type:
id: sound
scheme: url
- identifier: https://gitlab.tuwien.ac.at/martin.weise/fairnb
relation:
id: isderivedfrom
resource_type:
id: software
scheme: url