*** Wartungsfenster jeden ersten Mittwoch vormittag im Monat ***

Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • martin.weise/fairnb
1 result
Show changes
Commits on Source (6)
Showing
with 694 additions and 4228 deletions
host: https://dbrepo1.ec.tuwien.ac.at
database-id: <insert database id>
database-id: 23
credentials:
username: <insert username from dbrepo>
password: <insert password from dbrepo>
......
host: https://researchdata.tuwien.ac.at
credentials:
token: <insert token from invenio>
\ No newline at end of file
token: <insert token of InvenioRDM API>
\ No newline at end of file
......@@ -12,7 +12,7 @@ from keycloak import KeycloakOpenID
LOG = logging.getLogger(__name__)
TIMEOUT = 600
CHUNK_SIZE = 1024 * 1024 * 100
CHUNK_SIZE = 1024 * 1024 * 20
def re_auth(func: Callable) -> Callable:
@wraps(func)
......@@ -210,10 +210,10 @@ class DBRepoConnector:
return None
@re_auth
def create_table(self, dataframe: pd.DataFrame, table_name: str, table_descriptor: str):
def create_table(self, dataframe: pd.DataFrame, table_name: str, table_description: str):
""" Creates a new table """
data = self._create_table_data(dataframe, table_name, table_descriptor)
data = self._create_table_data(dataframe, table_name, table_description)
response = requests.post(
f"{self.host}/api/database/{self.database_id}/table",
......@@ -234,10 +234,10 @@ class DBRepoConnector:
def create_table_if_not_exists(self,
dataframe: pd.DataFrame,
table_name: str,
table_descriptor: str
table_description: str
):
table = table if (table := self.get_table(table_name)) is not None else \
self.create_table(dataframe, table_name, table_descriptor)
self.create_table(dataframe, table_name, table_description)
return table
......@@ -269,6 +269,7 @@ class DBRepoConnector:
"quote": '"',
"separator": ",",
"skip_lines": 1,
"line_termination": "\n",
"true_element": "True"
},
headers=self.headers
......
......@@ -8,7 +8,7 @@ import requests as rq
log = logging.getLogger(__name__)
class InvenioConnector:
class InvenioRDMConnector:
def __init__(self, token: str, host: str | None = None):
self.host = host or "https://test.researchdata.tuwien.ac.at"
self.token = token
......@@ -130,14 +130,14 @@ class InvenioConnector:
executor.map(lambda p: self.download_file(*p), args)
class InvenioManager:
class InvenioRDMManager:
"""A high level interface to up- and download files from invenio.
Utilizes state management:
1. record not assigned
2. record assigned
3. record published"""
def __init__(self, invenio_connector: InvenioConnector, record_id: str | None = None):
def __init__(self, invenio_connector: InvenioRDMConnector, record_id: str | None = None):
self.connector = invenio_connector
self.record_id = record_id
self.doi = None
......
......@@ -9,7 +9,7 @@ from fairnb.entity.entity_provenance import EntityProvenance
@dataclass
class DbRepoEntity(Entity):
class DBRepoEntity(Entity):
table_name: str = field(init=True, default=None)
table_description: str = field(init=True, default="")
table_id: int = field(init=False, default=None)
......@@ -19,7 +19,7 @@ class DbRepoEntity(Entity):
super().__post_init__()
if self.metadata is not None: # equivalent to: self.id is not None
self.table_id = int(self.metadata.uri.split("/")[-1])
self.table_id = int(self.metadata.pi.split("/")[-1])
else:
        assert self.table_name is not None # has to exist for the ability to get table_id
......@@ -60,9 +60,10 @@ class DbRepoEntity(Entity):
self.location.resolve().parent.mkdir(parents=True, exist_ok=True)
df.to_csv(self.location, index=False)
def upload(self, executed_file: Path, dependencies: list[Entity] = None,
def upload(self, executed_file: Path, main_file: Path,
dependencies: list[Entity] = None,
start_time: datetime = datetime.now(),
end_time: datetime = datetime.now()):
end_time: datetime = datetime.now()) -> EntityProvenance:
df = pd.read_csv(self.location)
# add id column to df:
......@@ -80,10 +81,11 @@ class DbRepoEntity(Entity):
self.name,
self.description,
executed_file=executed_file,
uri=f"{self.dbrepo_connector.host}/database/"
main_file=main_file,
pi=f"{self.dbrepo_connector.host}/database/"
f"{self.dbrepo_connector.database_id}/table/{self.table_id}",
type=self.type,
platform=self.repository,
repository=self.repository,
started_at=start_time,
ended_at=end_time
)
......@@ -98,6 +100,8 @@ class DbRepoEntity(Entity):
self.upload_dependencies(dependencies)
return self.metadata
def upload_data(self, df: pd.DataFrame):
assert self.id is not None
assert self.table_id is not None
......
......@@ -11,12 +11,13 @@ from fairnb.api.dbrepo import DBRepoConnector
from fairnb.entity.entity_provenance import EntityProvenance
PROVENANCE_TABLE_NAME = "entity_provenance_test3"
DEPENDENCY_TABLE_NAME = "entity_dependencies_test3"
PROVENANCE_TABLE_NAME = "entity_provenance"
DEPENDENCY_TABLE_NAME = "entity_dependencies"
LOG = logging.getLogger(__name__)
# TODO: Upload Datetime objects as Timestamps instead of str
@dataclass
class Entity(ABC):
    """ An O-Prov Entity class used to represent an Entity created by a notebook.
......@@ -65,12 +66,12 @@ class Entity(ABC):
self.download_provenance()
@abstractmethod
def download(self) -> EntityProvenance:
def download(self):
"""Download this Entity and return the attached EntityProvenance"""
raise NotImplementedError
@abstractmethod
def upload(self, executed_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
def upload(self, executed_file: Path, main_file: Path, dependencies: list, started_at=datetime.now(), ended_at=datetime.now()):
"""Upload this Entity"""
raise NotImplementedError
......@@ -131,6 +132,8 @@ class Entity(ABC):
self.id = meta.id
self.metadata = meta
LOG.info(f"Uploaded provenance information for {self.name} with id {self.id}: {self.metadata}")
def upload_dependencies(self, dependencies):
""" Upload the dependency information for this Entity.
It lists all entities, which have an id, this entity depends on.
......
......@@ -15,7 +15,7 @@ class EntityProvenance:
"""
id: str | None # id of entity, always unique
uri: str # unique resource identifier used to locate entity (can also be used to point to table containing entity)
pi: str # persistent identifier used to locate entity (can also be used to point to table containing entity)
name: str # name of specific entity describing the data it contains
    description: str # more detailed description of the entity
type: str # type of entity, if notebook is run with different data type stays the same
......@@ -23,9 +23,10 @@ class EntityProvenance:
branch: str # the branch of the repository, makes manual search of commit easier
repo_uri: str # the uri of the repository, used to locate the repository
executed_file: str # path to notebook which was executed to create the entity
started_at: datetime # start time of execution where entity was created
ended_at: datetime # end time of execution where entity was created
platform: str # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
main_file: str # path to the main file executing the notebook
started_at: datetime # start time of execution where entity was created
ended_at: datetime # end time of execution where entity was created
repository: str # platform on which the entity is uploaded (e.g. dbrepo, invenio, ...)
@classmethod
def new(
......@@ -33,9 +34,10 @@ class EntityProvenance:
name: str,
description: str,
executed_file: Path,
main_file: Path,
type: str,
uri: str,
platform: str,
pi: str,
repository: str,
started_at: datetime,
ended_at: datetime
):
......@@ -50,20 +52,22 @@ class EntityProvenance:
repo_uri = re.sub(":\d+/", "/", f"https://{repo_uri.split('@', 1)[1]}")
executed_file_rel = executed_file.resolve().relative_to(BASE_PATH)
main_file_rel = main_file.resolve().relative_to(BASE_PATH)
return cls(
id=None,
name=name,
description=description,
uri=uri,
pi=pi,
commit=commit,
repo_uri=repo_uri,
started_at=started_at,
ended_at=ended_at,
branch=branch,
executed_file=executed_file_rel.as_posix(),
main_file=main_file_rel.as_posix(),
type=type,
platform=platform,
repository=repository,
)
@classmethod
......@@ -72,10 +76,11 @@ class EntityProvenance:
id=df["id"],
name=df["name"],
description=df["description"],
uri=df["uri"],
pi=df["pi"],
commit=df["commit"],
repo_uri=df["git_uri"],
executed_file=df["executed_file"],
main_file=df["main_file"],
started_at=datetime.strptime(
df["started_at"], "%Y-%m-%d %H:%M:%S.%f"
), # TODO: replace with '%F %T'
......@@ -84,7 +89,7 @@ class EntityProvenance:
),
branch=df["branch"],
type=df["type"],
platform=df["repository"],
repository=df["repository"],
)
def to_frame(self):
......@@ -93,14 +98,15 @@ class EntityProvenance:
"id": pd.Series(self.id, dtype=str),
"name": pd.Series(self.name, dtype=str),
"description": pd.Series(self.description, dtype=str),
"uri": pd.Series(self.uri, dtype=str),
"pi": pd.Series(self.pi, dtype=str),
"commit": pd.Series(self.commit, dtype=str),
"git_uri": pd.Series(self.repo_uri, dtype=str),
"executed_file": pd.Series(self.executed_file, dtype=str),
"main_file": pd.Series(self.main_file, dtype=str),
"started_at": pd.Series(self.started_at, dtype=str),
"ended_at": pd.Series(self.ended_at, dtype=str),
"branch": pd.Series(self.branch, dtype=str),
"type": pd.Series(self.type, dtype=str),
"repository": pd.Series(self.platform, dtype=str),
"repository": pd.Series(self.repository, dtype=str),
}
)
......@@ -3,14 +3,14 @@ from datetime import datetime
from pathlib import Path
from fairnb.api.dbrepo import DBRepoConnector
from fairnb.api.invenio import InvenioManager, InvenioConnector
from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
from fairnb.entity.entity import Entity
from fairnb.entity.entity_provenance import EntityProvenance
@dataclass
class InvenioEntity(Entity):
invenio_manager: InvenioManager = field(init=True, default=None)
class InvenioRDMEntity(Entity):
invenio_manager: InvenioRDMManager = field(init=True, default=None)
record_metadata: dict = field(init=True, default=None)
publish_record: bool = field(init=True, default=False)
platform: str = field(init=False, default="https://doi.org/10.17616/R31NJMYD")
......@@ -24,11 +24,11 @@ class InvenioEntity(Entity):
description: str,
type: str,
dbrepo_connector: DBRepoConnector,
invenio_connector: InvenioConnector,
invenio_connector: InvenioRDMConnector,
publish_record: bool = False,
):
return cls(
invenio_manager=InvenioManager(invenio_connector),
invenio_manager=InvenioRDMManager(invenio_connector),
record_metadata=record_metadata,
dbrepo_connector=dbrepo_connector,
location=location,
......@@ -44,13 +44,13 @@ class InvenioEntity(Entity):
id: str,
location: Path,
dbrepo_connector: DBRepoConnector,
invenio_connector: InvenioConnector,
invenio_connector: InvenioRDMConnector,
):
return cls(
id=id,
location=location,
dbrepo_connector=dbrepo_connector,
invenio_manager=InvenioManager(invenio_connector)
invenio_manager=InvenioRDMManager(invenio_connector)
)
def __post_init__(self):
......@@ -60,9 +60,10 @@ class InvenioEntity(Entity):
assert self.record_metadata is not None
return
self.invenio_manager.record_id = self.metadata.uri.split('/')[-1]
self.invenio_manager.record_id = self.metadata.pi.split('/')[-1]
def upload(self, executed_file: Path, dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
def upload(self, executed_file: Path, main_file: Path,
dependencies: list[Entity] = None, started_at=datetime.now(), ended_at=datetime.now()):
dir_path: Path
regex: str
......@@ -91,9 +92,10 @@ class InvenioEntity(Entity):
name=self.name,
description=self.description,
executed_file=executed_file,
uri=uri.replace('/api', ''),
main_file=main_file,
pi=uri.replace('/api', ''),
type=self.type,
platform=self.platform,
repository=self.platform,
started_at=started_at,
ended_at=ended_at,
)
......
......@@ -73,6 +73,7 @@ class Executor:
# use inspect to get path of caller
entity.upload(
nb_config.nb_location,
nb_config.main_location,
nb_config.dependencies,
nb_config.started_at,
nb_config.ended_at
......
......@@ -8,6 +8,7 @@ from fairnb.entity.entity import Entity
@dataclass
class NbConfig:
nb_location: Path
main_location: Path
entities: list[Entity]
dependencies: list[Entity]
nb_output_location: Path = field(init=True, default=None)
......
......@@ -5,7 +5,7 @@ import pandas as pd
import tarfile
from fairnb.api.dbrepo import DBRepoConnector
from fairnb.api.invenio import InvenioManager, InvenioConnector
from fairnb.api.invenio import InvenioRDMManager, InvenioRDMConnector
from definitions import CONFIG_PATH
import yaml
......@@ -46,14 +46,14 @@ class Util:
def get_invenio_connector(self, path: pathlib.Path = None):
config = self.get_config(path=path)
return InvenioConnector(
return InvenioRDMConnector(
token=config["credentials"]["token"],
host=config["host"]
)
def get_invenio_manager(self, path: pathlib.Path = None):
config = self.get_config(path=path)
return InvenioManager(
return InvenioRDMManager(
self.get_invenio_connector(path=path)
)
......
%% Cell type:markdown id:4389a8092677254e tags:
# Audio Files
Bundle the provided audio files (400, in MP3) in a tar, compress it using gzip and store it in the output folder.
%% Cell type:code id:87ab37c6 tags:
``` python
from definitions import BASE_PATH
import tarfile
import zipfile
import os
from pathlib import Path
```
%% Cell type:code id:1b4e6b01 tags:parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": str(BASE_PATH / "tmp/1_audio_files/output/emotifymusic.tar.gz")
}
```
%% Cell type:code id:15dea136 tags:injected-parameters
%% Cell type:code id:1a6df3b0 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {}
OUTPUT_PATHS = {
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/1_audio_files/output/emotifymusic.tar.gz"
"audio_tar": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/1_audio_files/output/emotifymusic.tar.gz"
}
```
%% Cell type:code id:1e487573 tags:
``` python
# load provided files
zip_path = BASE_PATH / "resource" / "1_audio_files" / "emotifymusic.zip"
dir_path = BASE_PATH / "tmp" / "1_audio_files" / "music"
dir_path.mkdir(parents=True, exist_ok=True)
# unzip to dir_path
with zipfile.ZipFile(zip_path, "r") as zfile:
zfile.extractall(path=dir_path)
```
%% Cell type:code id:c3193f35 tags:
``` python
file_paths = list(dir_path.rglob('**/*.*'))
flattened_dir_path = BASE_PATH / "tmp" / "1_audio_files" / "flattened"
flattened_dir_path.mkdir(parents=True, exist_ok=True)
for path in file_paths:
(flattened_dir_path / path.relative_to(dir_path).as_posix().replace('/', '_')).write_bytes(path.read_bytes())
```
%% Cell type:code id:3272ea2b tags:
``` python
tar_path = Path(OUTPUT_PATHS["audio_tar"])
tar_path.parent.mkdir(parents=True, exist_ok=True)
with tarfile.open(tar_path, "w:gz") as file:
file.add(flattened_dir_path, arcname=os.path.sep)
```
......
This diff is collapsed.
This diff is collapsed.
%% Cell type:markdown id:e92b4fe9 tags:
# Split the Features into Train and Test Set
%% Cell type:code id:5f1fae44 tags:
``` python
import pandas as pd
from pathlib import Path
from definitions import BASE_PATH
```
%% Cell type:code id:01de1b27 tags:parameters
``` python
# Tagged with 'parameters'
from definitions import BASE_PATH
INPUT_PATHS: dict[str, str] = {
"features": (BASE_PATH / "tmp" / "4_split" / "input" / "features.csv").__str__()
}
OUTPUT_PATHS: dict[str, str] = {
"split": (BASE_PATH / "tmp" / "4_split" / "output" / "split.csv").__str__()
}
```
%% Cell type:code id:e99ca0ba tags:injected-parameters
%% Cell type:code id:fdc0a0a6 tags:injected-parameters
``` python
# Parameters
INPUT_PATHS = {
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/input/features.csv"
"aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/input/features.csv"
}
OUTPUT_PATHS = {
"split": "/home/lukas/Programming/uni/bachelorarbeit/fairnb/tmp/4_split/output/split.csv"
"split": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/4_split/output/split.csv"
}
```
%% Cell type:code id:a4cc6800 tags:
``` python
# INPUT
for path in INPUT_PATHS.values():
assert Path(path).exists()
features = pd.read_csv(INPUT_PATHS["aggregated_features"])
```
%% Cell type:code id:a186d0c4 tags:
``` python
train = features.sample(frac=0.8).sort_index()
train = features.sample(frac=0.8, random_state=11908553).sort_index()
test = features.drop(train.index)
split_true = pd.DataFrame({
"filename": train.filename,
"train": True
})
split_false = pd.DataFrame({
"filename": test.filename,
"train": False
})
split_concat = pd.concat([split_true, split_false])\
.sort_values("filename")\
.reset_index(drop=True)
```
%% Cell type:code id:091e0641 tags:
``` python
split_concat
```
%% Output
filename train
0 classical_1.mp3 False
1 classical_10.mp3 True
2 classical_100.mp3 False
3 classical_11.mp3 True
4 classical_12.mp3 True
.. ... ...
395 rock_95.mp3 False
396 rock_96.mp3 True
397 rock_97.mp3 True
398 rock_98.mp3 True
399 rock_99.mp3 True
[400 rows x 2 columns]
%% Cell type:code id:7b11b8bb tags:
``` python
# output
OUTPUT_PATH = Path(OUTPUT_PATHS["split"])
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
output = split_concat
output.to_csv(OUTPUT_PATH, index=False)
```
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -123,13 +123,13 @@ files = [
[[package]]
name = "anyio"
version = "4.2.0"
version = "4.3.0"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
optional = false
python-versions = ">=3.8"
files = [
{file = "anyio-4.2.0-py3-none-any.whl", hash = "sha256:745843b39e829e108e518c489b31dc757de7d2131d53fac32bd8df268227bfee"},
{file = "anyio-4.2.0.tar.gz", hash = "sha256:e1875bb4b4e2de1669f4bc7869b6d3f54231cdced71605e6e64c9be77e3be50f"},
{file = "anyio-4.3.0-py3-none-any.whl", hash = "sha256:048e05d0f6caeed70d731f3db756d35dcc1f35747c8c403364a8332c630441b8"},
{file = "anyio-4.3.0.tar.gz", hash = "sha256:f75253795a87df48568485fd18cdd2a3fa5c4f7c5be8e5e36637733fce06fed6"},
]
[package.dependencies]
......@@ -2149,13 +2149,13 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=
[[package]]
name = "nbconvert"
version = "7.16.0"
description = "Converting Jupyter Notebooks"
version = "7.16.1"
description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)."
optional = false
python-versions = ">=3.8"
files = [
{file = "nbconvert-7.16.0-py3-none-any.whl", hash = "sha256:ad3dc865ea6e2768d31b7eb6c7ab3be014927216a5ece3ef276748dd809054c7"},
{file = "nbconvert-7.16.0.tar.gz", hash = "sha256:813e6553796362489ae572e39ba1bff978536192fb518e10826b0e8cadf03ec8"},
{file = "nbconvert-7.16.1-py3-none-any.whl", hash = "sha256:3188727dffadfdc9c6a1c7250729063d7bc78b355ad7aa023138afa030d1cd07"},
{file = "nbconvert-7.16.1.tar.gz", hash = "sha256:e79e6a074f49ba3ed29428ed86487bf51509d9aab613bd8522ac08f6d28fd7fd"},
]
[package.dependencies]
......@@ -2917,19 +2917,23 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pydantic-settings"
version = "2.1.0"
version = "2.2.0"
description = "Settings management using Pydantic"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydantic_settings-2.1.0-py3-none-any.whl", hash = "sha256:7621c0cb5d90d1140d2f0ef557bdf03573aac7035948109adf2574770b77605a"},
{file = "pydantic_settings-2.1.0.tar.gz", hash = "sha256:26b1492e0a24755626ac5e6d715e9077ab7ad4fb5f19a8b7ed7011d52f36141c"},
{file = "pydantic_settings-2.2.0-py3-none-any.whl", hash = "sha256:5f7bcaf9ad4419559dc5ac155c0324a9aeb2547c60471ee7c7d026f467a6b515"},
{file = "pydantic_settings-2.2.0.tar.gz", hash = "sha256:648d0a76673e69c51278979cba2e83cf16a23d57519bfd7e553d1c3f37db5560"},
]
[package.dependencies]
pydantic = ">=2.3.0"
python-dotenv = ">=0.21.0"
[package.extras]
toml = ["tomlkit (>=0.12)"]
yaml = ["pyyaml (>=6.0.1)"]
[[package]]
name = "pygments"
version = "2.17.2"
......@@ -3961,13 +3965,13 @@ dev = ["flake8", "flake8-annotations", "flake8-bandit", "flake8-bugbear", "flake
[[package]]
name = "urllib3"
version = "2.2.0"
version = "2.2.1"
description = "HTTP library with thread-safe connection pooling, file post, and more."
optional = false
python-versions = ">=3.8"
files = [
{file = "urllib3-2.2.0-py3-none-any.whl", hash = "sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224"},
{file = "urllib3-2.2.0.tar.gz", hash = "sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20"},
{file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"},
{file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
]
[package.extras]
......
access:
files: public
files: restricted
record: public
files:
default_preview: null
......@@ -24,13 +24,13 @@ metadata:
publisher: TU Wien
related_identifiers:
- identifier: https://www2.projects.science.uu.nl/memotion/emotifydata/
relation:
relation_type:
id: isderivedfrom
resource_type:
id: sound
scheme: url
- identifier: https://gitlab.tuwien.ac.at/martin.weise/fairnb
relation:
relation_type:
id: isderivedfrom
resource_type:
id: software
......