Compare revisions

Project: martin.weise/fairnb
Commits on Source (6)
Showing with 694 additions and 4228 deletions
 host: https://dbrepo1.ec.tuwien.ac.at
-database-id: <insert database id>
+database-id: 23
 credentials:
   username: <insert username from dbrepo>
   password: <insert password from dbrepo>
......
 host: https://researchdata.tuwien.ac.at
 credentials:
-  token: <insert token from invenio>
+  token: <insert token of InvenioRDM API>
\ No newline at end of file
@@ -12,7 +12,7 @@ from keycloak import KeycloakOpenID
 LOG = logging.getLogger(__name__)
 TIMEOUT = 600
-CHUNK_SIZE = 1024 * 1024 * 100
+CHUNK_SIZE = 1024 * 1024 * 20

 def re_auth(func: Callable) -> Callable:
     @wraps(func)
@@ -210,10 +210,10 @@ class DBRepoConnector:
         return None

     @re_auth
-    def create_table(self, dataframe: pd.DataFrame, table_name: str, table_descriptor: str):
+    def create_table(self, dataframe: pd.DataFrame, table_name: str, table_description: str):
         """ Creates a new table """
-        data = self._create_table_data(dataframe, table_name, table_descriptor)
+        data = self._create_table_data(dataframe, table_name, table_description)
         response = requests.post(
             f"{self.host}/api/database/{self.database_id}/table",
@@ -234,10 +234,10 @@ class DBRepoConnector:
     def create_table_if_not_exists(self,
                                    dataframe: pd.DataFrame,
                                    table_name: str,
-                                   table_descriptor: str
+                                   table_description: str
                                    ):
         table = table if (table := self.get_table(table_name)) is not None else \
-            self.create_table(dataframe, table_name, table_descriptor)
+            self.create_table(dataframe, table_name, table_description)
         return table
@@ -269,6 +269,7 @@ class DBRepoConnector:
                 "quote": '"',
                 "separator": ",",
                 "skip_lines": 1,
+                "line_termination": "\n",
                 "true_element": "True"
             },
             headers=self.headers
......
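The hunks above rename `table_descriptor` to `table_description`, add an explicit `"line_termination"` to the CSV import options, and shrink the upload chunk size from 100 MiB to 20 MiB. A minimal sketch of chunked reading with the new size; the file path and the `upload_chunk` stand-in are hypothetical, not part of the diff:

``` python
from pathlib import Path

CHUNK_SIZE = 1024 * 1024 * 20  # 20 MiB, the new value from the hunk above

def iter_chunks(path: Path, chunk_size: int = CHUNK_SIZE):
    """Yield a binary file in fixed-size chunks."""
    with path.open("rb") as f:
        while chunk := f.read(chunk_size):
            yield chunk

# hypothetical usage: each chunk would go to an upload call such as upload_chunk(chunk)
for i, chunk in enumerate(iter_chunks(Path("features.csv"))):
    print(f"chunk {i}: {len(chunk)} bytes")
```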
@@ -8,7 +8,7 @@ import requests as rq
 log = logging.getLogger(__name__)

-class InvenioConnector:
+class InvenioRDMConnector:
     def __init__(self, token: str, host: str | None = None):
         self.host = host or "https://test.researchdata.tuwien.ac.at"
         self.token = token
@@ -130,14 +130,14 @@ class InvenioConnector:
             executor.map(lambda p: self.download_file(*p), args)

-class InvenioManager:
+class InvenioRDMManager:
     """A high level interface to up- and download files from invenio.

     Utilizes state management:
     1. record not assigned
     2. record assigned
     3. record published"""

-    def __init__(self, invenio_connector: InvenioConnector, record_id: str | None = None):
+    def __init__(self, invenio_connector: InvenioRDMConnector, record_id: str | None = None):
         self.connector = invenio_connector
         self.record_id = record_id
         self.doi = None
......
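The `Invenio*` to `InvenioRDM*` renames are mechanical; both constructor signatures are unchanged. A minimal sketch of wiring the renamed classes together, assuming a valid API token; the record id is made up:

``` python
from fairnb.api.invenio import InvenioRDMConnector, InvenioRDMManager

# the token normally comes from the credentials config shown at the top of this diff
connector = InvenioRDMConnector(
    token="<insert token of InvenioRDM API>",
    host="https://test.researchdata.tuwien.ac.at",
)

manager = InvenioRDMManager(connector)   # state 1: no record assigned
manager.record_id = "abcde-12345"        # hypothetical id; moves the manager to state 2
```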
@@ -9,7 +9,7 @@ from fairnb.entity.entity_provenance import EntityProvenance

 @dataclass
-class DbRepoEntity(Entity):
+class DBRepoEntity(Entity):
     table_name: str = field(init=True, default=None)
     table_description: str = field(init=True, default="")
     table_id: int = field(init=False, default=None)
@@ -19,7 +19,7 @@ class DbRepoEntity(Entity):
         super().__post_init__()

         if self.metadata is not None:  # equivalent to: self.id is not None
-            self.table_id = int(self.metadata.uri.split("/")[-1])
+            self.table_id = int(self.metadata.pi.split("/")[-1])
         else:
             assert self.table_name is not None  # has to exist for the ability to get table_id
@@ -60,9 +60,10 @@ class DbRepoEntity(Entity):
         self.location.resolve().parent.mkdir(parents=True, exist_ok=True)
         df.to_csv(self.location, index=False)

-    def upload(self, executed_file: Path, dependencies: list[Entity] = None,
+    def upload(self, executed_file: Path, main_file: Path,
+               dependencies: list[Entity] = None,
                start_time: datetime = datetime.now(),
-               end_time: datetime = datetime.now()):
+               end_time: datetime = datetime.now()) -> EntityProvenance:
         df = pd.read_csv(self.location)

         # add id column to df:
@@ -80,10 +81,11 @@ class DbRepoEntity(Entity):
             self.name,
             self.description,
             executed_file=executed_file,
-            uri=f"{self.dbrepo_connector.host}/database/"
+            main_file=main_file,
+            pi=f"{self.dbrepo_connector.host}/database/"
                 f"{self.dbrepo_connector.database_id}/table/{self.table_id}",
             type=self.type,
-            platform=self.repository,
+            repository=self.repository,
             started_at=start_time,
             ended_at=end_time
         )
@@ -98,6 +100,8 @@ class DbRepoEntity(Entity):
         self.upload_dependencies(dependencies)

+        return self.metadata
+
     def upload_data(self, df: pd.DataFrame):
         assert self.id is not None
         assert self.table_id is not None
......
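The `uri` to `pi` rename reaches into `__post_init__`, which recovers the numeric table id from the last path segment of the persistent identifier. A small illustration of that parsing; host and ids are hypothetical:

``` python
# shape of the pi built in upload() above (values are made up)
pi = "https://dbrepo1.ec.tuwien.ac.at/database/23/table/42"

table_id = int(pi.split("/")[-1])  # same expression as in __post_init__
assert table_id == 42
```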
@@ -11,12 +11,13 @@ from fairnb.api.dbrepo import DBRepoConnector
 from fairnb.entity.entity_provenance import EntityProvenance

-PROVENANCE_TABLE_NAME = "entity_provenance_test3"
-DEPENDENCY_TABLE_NAME = "entity_dependencies_test3"
+PROVENANCE_TABLE_NAME = "entity_provenance"
+DEPENDENCY_TABLE_NAME = "entity_dependencies"
 LOG = logging.getLogger(__name__)

 # TODO: Upload Datetime objects as Timestamps instead of str

 @dataclass
 class Entity(ABC):
     """ An O-Prov Entity class used to represent an Entity created by a notebook.
@@ -65,12 +66,12 @@ class Entity(ABC):
         self.download_provenance()

     @abstractmethod
-    def download(self) -> EntityProvenance:
+    def download(self):
         """Download this Entity and return the attached EntityProvenance"""
         raise NotImplementedError

     @abstractmethod
-    def upload(self, executed_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
+    def upload(self, executed_file: Path, main_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
         """Upload this Entity"""
         raise NotImplementedError
@@ -131,6 +132,8 @@ class Entity(ABC):
         self.id = meta.id
         self.metadata = meta

+        LOG.info(f"Uploaded provenance information for {self.name} with id {self.id}: {self.metadata}")
+
     def upload_dependencies(self, dependencies):
         """ Upload the dependency information for this Entity.
         It lists all entities with an id that this entity depends on.
......
@@ -15,7 +15,7 @@ class EntityProvenance:
     """
     id: str | None  # id of entity, always unique
-    uri: str  # unique resource identifier used to locate entity (can also be used to point to table containing entity)
+    pi: str  # persistent identifier used to locate entity (can also be used to point to table containing entity)
     name: str  # name of specific entity describing the data it contains
     description: str  # more detailed description of the entity
     type: str  # type of entity, if notebook is run with different data type stays the same
@@ -23,9 +23,10 @@ class EntityProvenance:
     branch: str  # the branch of the repository, makes manual search of commit easier
     repo_uri: str  # the uri of the repository, used to locate the repository
     executed_file: str  # path to notebook which was executed to create the entity
+    main_file: str  # path to the main file executing the notebook
     started_at: datetime  # start time of execution where entity was created
     ended_at: datetime  # end time of execution where entity was created
-    platform: str  # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
+    repository: str  # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)

     @classmethod
     def new(
@@ -33,9 +34,10 @@ class EntityProvenance:
         name: str,
         description: str,
         executed_file: Path,
+        main_file: Path,
         type: str,
-        uri: str,
-        platform: str,
+        pi: str,
+        repository: str,
         started_at: datetime,
         ended_at: datetime
     ):
@@ -50,20 +52,22 @@ class EntityProvenance:
         repo_uri = re.sub(":\d+/", "/", f"https://{repo_uri.split('@', 1)[1]}")

         executed_file_rel = executed_file.resolve().relative_to(BASE_PATH)
+        main_file_rel = main_file.resolve().relative_to(BASE_PATH)

         return cls(
             id=None,
             name=name,
             description=description,
-            uri=uri,
+            pi=pi,
             commit=commit,
             repo_uri=repo_uri,
             started_at=started_at,
             ended_at=ended_at,
             branch=branch,
             executed_file=executed_file_rel.as_posix(),
+            main_file=main_file_rel.as_posix(),
             type=type,
-            platform=platform,
+            repository=repository,
         )

     @classmethod
@@ -72,10 +76,11 @@ class EntityProvenance:
             id=df["id"],
             name=df["name"],
             description=df["description"],
-            uri=df["uri"],
+            pi=df["pi"],
             commit=df["commit"],
             repo_uri=df["git_uri"],
             executed_file=df["executed_file"],
+            main_file=df["main_file"],
             started_at=datetime.strptime(
                 df["started_at"], "%Y-%m-%d %H:%M:%S.%f"
             ),  # TODO: replace with '%F %T'
@@ -84,7 +89,7 @@ class EntityProvenance:
             ),
             branch=df["branch"],
             type=df["type"],
-            platform=df["repository"],
+            repository=df["repository"],
         )

     def to_frame(self):
@@ -93,14 +98,15 @@ class EntityProvenance:
                 "id": pd.Series(self.id, dtype=str),
                 "name": pd.Series(self.name, dtype=str),
                 "description": pd.Series(self.description, dtype=str),
-                "uri": pd.Series(self.uri, dtype=str),
+                "pi": pd.Series(self.pi, dtype=str),
                 "commit": pd.Series(self.commit, dtype=str),
                 "git_uri": pd.Series(self.repo_uri, dtype=str),
                 "executed_file": pd.Series(self.executed_file, dtype=str),
+                "main_file": pd.Series(self.main_file, dtype=str),
                 "started_at": pd.Series(self.started_at, dtype=str),
                 "ended_at": pd.Series(self.ended_at, dtype=str),
                 "branch": pd.Series(self.branch, dtype=str),
                 "type": pd.Series(self.type, dtype=str),
-                "repository": pd.Series(self.platform, dtype=str),
+                "repository": pd.Series(self.repository, dtype=str),
             }
         )
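With `main_file` added and `uri`/`platform` renamed to `pi`/`repository`, a call to `EntityProvenance.new` now looks roughly like the sketch below. All values are placeholders, and both paths must live under `BASE_PATH`, since `new` resolves them relative to it:

``` python
from datetime import datetime
from pathlib import Path

from fairnb.entity.entity_provenance import EntityProvenance

provenance = EntityProvenance.new(
    name="features",                                            # placeholder
    description="aggregated features for the emotify dataset",  # placeholder
    executed_file=Path("notebooks/3_features.ipynb"),           # placeholder path
    main_file=Path("main.py"),                                  # placeholder path
    type="csv",
    pi="https://dbrepo1.ec.tuwien.ac.at/database/23/table/42",  # placeholder pi
    repository="dbrepo",
    started_at=datetime.now(),
    ended_at=datetime.now(),
)
```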
@@ -3,14 +3,14 @@ from datetime import datetime
 from pathlib import Path

 from fairnb.api.dbrepo import DBRepoConnector
-from fairnb.api.invenio import InvenioManager, InvenioConnector
+from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
 from fairnb.entity.entity import Entity
 from fairnb.entity.entity_provenance import EntityProvenance

 @dataclass
-class InvenioEntity(Entity):
-    invenio_manager: InvenioManager = field(init=True, default=None)
+class InvenioRDMEntity(Entity):
+    invenio_manager: InvenioRDMManager = field(init=True, default=None)
     record_metadata: dict = field(init=True, default=None)
     publish_record: bool = field(init=True, default=False)
     platform: str = field(init=False, default="https://doi.org/10.17616/R31NJMYD")
@@ -24,11 +24,11 @@ class InvenioEntity(Entity):
         description: str,
         type: str,
         dbrepo_connector: DBRepoConnector,
-        invenio_connector: InvenioConnector,
+        invenio_connector: InvenioRDMConnector,
         publish_record: bool = False,
     ):
         return cls(
-            invenio_manager=InvenioManager(invenio_connector),
+            invenio_manager=InvenioRDMManager(invenio_connector),
             record_metadata=record_metadata,
             dbrepo_connector=dbrepo_connector,
             location=location,
@@ -44,13 +44,13 @@ class InvenioEntity(Entity):
         id: str,
         location: Path,
         dbrepo_connector: DBRepoConnector,
-        invenio_connector: InvenioConnector,
+        invenio_connector: InvenioRDMConnector,
     ):
         return cls(
             id=id,
             location=location,
             dbrepo_connector=dbrepo_connector,
-            invenio_manager=InvenioManager(invenio_connector)
+            invenio_manager=InvenioRDMManager(invenio_connector)
         )

     def __post_init__(self):
@@ -60,9 +60,10 @@ class InvenioEntity(Entity):
             assert self.record_metadata is not None
             return

-        self.invenio_manager.record_id = self.metadata.uri.split('/')[-1]
+        self.invenio_manager.record_id = self.metadata.pi.split('/')[-1]

-    def upload(self, executed_file: Path, dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
+    def upload(self, executed_file: Path, main_file: Path,
+               dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
         dir_path: Path
         regex: str
@@ -91,9 +92,10 @@ class InvenioEntity(Entity):
             name=self.name,
             description=self.description,
             executed_file=executed_file,
-            uri=uri.replace('/api', ''),
+            main_file=main_file,
+            pi=uri.replace('/api', ''),
             type=self.type,
-            platform=self.platform,
+            repository=self.platform,
             started_at=started_at,
             ended_at=ended_at,
         )
......
@@ -73,6 +73,7 @@ class Executor:
             # use inspect to get path of caller
             entity.upload(
                 nb_config.nb_location,
+                nb_config.main_location,
                 nb_config.dependencies,
                 nb_config.started_at,
                 nb_config.ended_at
......
@@ -8,6 +8,7 @@ from fairnb.entity.entity import Entity

 @dataclass
 class NbConfig:
     nb_location: Path
+    main_location: Path
     entities: list[Entity]
     dependencies: list[Entity]
     nb_output_location: Path = field(init=True, default=None)
......
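Every `NbConfig` now carries the `main_location` that the executor forwards to `entity.upload`. A sketch of constructing one; the import path is an assumption and the entity lists are left empty for brevity:

``` python
from pathlib import Path

from fairnb.nb_config import NbConfig  # import path is an assumption

config = NbConfig(
    nb_location=Path("notebooks/4_split.ipynb"),  # notebook to execute
    main_location=Path("main.py"),                # file driving the execution
    entities=[],
    dependencies=[],
)
```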
@@ -5,7 +5,7 @@ import pandas as pd
 import tarfile

 from fairnb.api.dbrepo import DBRepoConnector
-from fairnb.api.invenio import InvenioManager, InvenioConnector
+from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
 from definitions import CONFIG_PATH
 import yaml
@@ -46,14 +46,14 @@ class Util:
     def get_invenio_connector(self, path: pathlib.Path = None):
         config = self.get_config(path=path)
-        return InvenioConnector(
+        return InvenioRDMConnector(
             token=config["credentials"]["token"],
             host=config["host"]
         )

     def get_invenio_manager(self, path: pathlib.Path = None):
         config = self.get_config(path=path)
-        return InvenioManager(
+        return InvenioRDMManager(
             self.get_invenio_connector(path=path)
         )
......
%% Cell type:markdown id:4389a8092677254e tags:

# Audio Files

Bundle the provided audio files (400, in MP3) in a tar, compress it using gzip and store it in the output folder.

%% Cell type:code id:87ab37c6 tags:

``` python
from definitions import BASE_PATH
import tarfile
import zipfile
import os
from pathlib import Path
```

%% Cell type:code id:1b4e6b01 tags:parameters

``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
    "audio_tar": str(BASE_PATH / "tmp/1_audio_files/output/emotifymusic.tar.gz")
}
```

-%% Cell type:code id:15dea136 tags:injected-parameters
+%% Cell type:code id:1a6df3b0 tags:injected-parameters

``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
-    "audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/1_audio_files/output/emotifymusic.tar.gz"
+    "audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/1_audio_files/output/emotifymusic.tar.gz"
}
```

%% Cell type:code id:1e487573 tags:

``` python
# load provided files
zip_path = BASE_PATH / "resource" / "1_audio_files" / "emotifymusic.zip"
dir_path = BASE_PATH / "tmp" / "1_audio_files" / "music"
dir_path.mkdir(parents=True, exist_ok=True)

# unzip to dir_path
with zipfile.ZipFile(zip_path, "r") as zfile:
    zfile.extractall(path=dir_path)
```

%% Cell type:code id:c3193f35 tags:

``` python
file_paths = list(dir_path.rglob('**/*.*'))
flattened_dir_path = BASE_PATH / "tmp" / "1_audio_files" / "flattened"
flattened_dir_path.mkdir(parents=True, exist_ok=True)

for path in file_paths:
    (flattened_dir_path / path.relative_to(dir_path).as_posix().replace('/', '_')).write_bytes(path.read_bytes())
```

%% Cell type:code id:3272ea2b tags:

``` python
tar_path = Path(OUTPUT_PATHS["audio_tar"])
tar_path.parent.mkdir(parents=True, exist_ok=True)

with tarfile.open(tar_path, "w:gz") as file:
    file.add(flattened_dir_path, arcname=os.path.sep)
```
......
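Since the notebook flattens the directory tree before archiving, a quick check (not part of the notebook) can confirm the tarball holds the renamed files; `tar_path` is the variable from the last cell above:

``` python
import tarfile

# list the flattened member names, e.g. "classical_1.mp3"
with tarfile.open(tar_path, "r:gz") as tar:
    names = tar.getnames()
print(len(names), names[:3])
```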
This diff is collapsed.
This diff is collapsed.
%% Cell type:markdown id:e92b4fe9 tags:

# Split the Features into Train and Test Set

%% Cell type:code id:5f1fae44 tags:

``` python
import pandas as pd
from pathlib import Path
from definitions import BASE_PATH
```

%% Cell type:code id:01de1b27 tags:parameters

``` python
# Tagged with 'parameters'
from definitions import BASE_PATH

INPUT_PATHS: dict[str, str] = {
    "features": (BASE_PATH / "tmp" / "4_split" / "input" / "features.csv").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
    "split": (BASE_PATH / "tmp" / "4_split" / "output" / "split.csv").__str__()
}
```

-%% Cell type:code id:e99ca0ba tags:injected-parameters
+%% Cell type:code id:fdc0a0a6 tags:injected-parameters

``` python
# Parameters
INPUT_PATHS = {
-    "aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/input/features.csv"
+    "aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/input/features.csv"
}
OUTPUT_PATHS = {
-    "split": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/output/split.csv"
+    "split": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/output/split.csv"
}
```

%% Cell type:code id:a4cc6800 tags:

``` python
# INPUT
for path in INPUT_PATHS.values():
    assert Path(path).exists()

features = pd.read_csv(INPUT_PATHS["aggregated_features"])
```

%% Cell type:code id:a186d0c4 tags:

``` python
-train = features.sample(frac=0.8).sort_index()
+train = features.sample(frac=0.8, random_state=11908553).sort_index()
test = features.drop(train.index)

split_true = pd.DataFrame({
    "filename": train.filename,
    "train": True
})
split_false = pd.DataFrame({
    "filename": test.filename,
    "train": False
})
split_concat = pd.concat([split_true, split_false])\
    .sort_values("filename")\
    .reset_index(drop=True)
```

%% Cell type:code id:091e0641 tags:

``` python
split_concat
```

%% Output

              filename  train
0      classical_1.mp3  False
1     classical_10.mp3   True
2    classical_100.mp3  False
3     classical_11.mp3   True
4     classical_12.mp3   True
..                 ...    ...
395        rock_95.mp3  False
396        rock_96.mp3   True
397        rock_97.mp3   True
398        rock_98.mp3   True
399        rock_99.mp3   True

[400 rows x 2 columns]

%% Cell type:code id:7b11b8bb tags:

``` python
# output
OUTPUT_PATH = Path(OUTPUT_PATHS["split"])
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

output = split_concat
output.to_csv(OUTPUT_PATH, index=False)
```
......
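The added `random_state=11908553` makes the 80/20 split reproducible across runs. Downstream steps can join `split.csv` back onto the feature table by filename; a minimal sketch, assuming both CSVs exist at the paths used above:

``` python
import pandas as pd

features = pd.read_csv(INPUT_PATHS["aggregated_features"])
split = pd.read_csv(OUTPUT_PATHS["split"])

merged = features.merge(split, on="filename")  # adds the boolean "train" column
train_set = merged[merged["train"]]
test_set = merged[~merged["train"]]
```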
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
@@ -123,13 +123,13 @@ files = [

 [[package]]
 name = "anyio"
-version = "4.2.0"
+version = "4.3.0"
 description = "High level compatibility layer for multiple asynchronous event loop implementations"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"},
-    {file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"},
+    {file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
+    {file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
 ]

 [package.dependencies]
@@ -2149,13 +2149,13 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=

 [[package]]
 name = "nbconvert"
-version = "7.16.0"
+version = "7.16.1"
-description = "Converting Jupyter Notebooks"
+description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "nbconvert-7.16.0-py3-none-any.whl", hash = "sha256:ad3dc865ea6e2768d31b7eb6c7ab3be014927216a5ece3ef276748dd809054c7"},
-    {file = "nbconvert-7.16.0.tar.gz", hash = "sha256:813e6553796362489ae572e39ba1bff978536192fb518e10826b0e8cadf03ec8"},
+    {file = "nbconvert-7.16.1-py3-none-any.whl", hash = "sha256:3188727dffadfdc9c6a1c7250729063d7bc78b355ad7aa023138afa030d1cd07"},
+    {file = "nbconvert-7.16.1.tar.gz", hash = "sha256:e79e6a074f49ba3ed29428ed86487bf51509d9aab613bd8522ac08f6d28fd7fd"},
 ]

 [package.dependencies]
@@ -2917,19 +2917,23 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"

 [[package]]
 name = "pydantic-settings"
-version = "2.1.0"
+version = "2.2.0"
 description = "Settings management using Pydantic"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "pydantic_settings-2.1.0-py3-none-any.whl", hash = "sha256:7621c0cb5d90d1140d2f0ef557bdf03573aac7035948109adf2574770b77605a"},
-    {file = "pydantic_settings-2.1.0.tar.gz", hash = "sha256:26b1492e0a24755626ac5e6d715e9077ab7ad4fb5f19a8b7ed7011d52f36141c"},
+    {file = "pydantic_settings-2.2.0-py3-none-any.whl", hash = "sha256:5f7bcaf9ad4419559dc5ac155c0324a9aeb2547c60471ee7c7d026f467a6b515"},
+    {file = "pydantic_settings-2.2.0.tar.gz", hash = "sha256:648d0a76673e69c51278979cba2e83cf16a23d57519bfd7e553d1c3f37db5560"},
 ]

 [package.dependencies]
 pydantic = ">=2.3.0"
 python-dotenv = ">=0.21.0"

+[package.extras]
+toml = ["tomlkit (>=0.12)"]
+yaml = ["pyyaml (>=6.0.1)"]
+
 [[package]]
 name = "pygments"
 version = "2.17.2"
@@ -3961,13 +3965,13 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake

 [[package]]
 name = "urllib3"
-version = "2.2.0"
+version = "2.2.1"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "urllib3-2.2.0-py3-none-any.whl", hash = "sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224"},
-    {file = "urllib3-2.2.0.tar.gz", hash = "sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20"},
+    {file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
+    {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
 ]

 [package.extras]
......
 access:
-  files: public
+  files: restricted
   record: public
 files:
   default_preview: null
@@ -24,13 +24,13 @@ metadata:
   publisher: TU Wien
   related_identifiers:
     - identifier: https://www2.projects.science.uu.nl/memotion/emotifydata/
-      relation:
+      relation_type:
         id: isderivedfrom
       resource_type:
         id: sound
       scheme: url
     - identifier: https://gitlab.tuwien.ac.at/martin.weise/fairnb
-      relation:
+      relation_type:
         id: isderivedfrom
       resource_type:
         id: software
......
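These metadata fixes follow the InvenioRDM record schema, where related identifiers use the `relation_type` key and file access is `public` or `restricted`. A sketch of submitting such metadata as a draft record through the InvenioRDM REST API; the file name, host, and token are placeholders:

``` python
import requests
import yaml

# load the record metadata shown above (file name is an assumption)
with open("record_metadata.yml") as f:
    record = yaml.safe_load(f)

response = requests.post(
    "https://test.researchdata.tuwien.ac.at/api/records",  # draft-creation endpoint
    json=record,
    headers={"Authorization": "Bearer <insert token of InvenioRDM API>"},
)
response.raise_for_status()
print(response.json()["id"])  # id of the newly created draft
```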