*** Wartungsfenster jeden ersten Mittwoch vormittag im Monat ***

Skip to content
Snippets Groups Projects
3_aggregate_features.ipynb 7.62 KiB
Newer Older
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f48a4573",
   "metadata": {
    "papermill": {
     "duration": 0.006395,
     "end_time": "2023-08-21T16:05:27.959997",
     "exception": false,
     "start_time": "2023-08-21T16:05:27.953602",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Aggregate MFCC Features\n",
    "\n",
    "Aggregate the n rows per file into a single row (calculate min, max, mean, std and skew for each feature)."
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "389576b8",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-08-14T15:32:41.535589478Z",
     "start_time": "2023-08-14T15:32:40.986222405Z"
    },
    "collapsed": true,
    "execution": {
     "iopub.execute_input": "2023-08-21T16:05:27.992117Z",
     "iopub.status.busy": "2023-08-21T16:05:27.991058Z",
     "iopub.status.idle": "2023-08-21T16:05:29.085230Z",
     "shell.execute_reply": "2023-08-21T16:05:29.083759Z"
     "duration": 1.113632,
     "end_time": "2023-08-21T16:05:29.090761",
     "exception": false,
     "start_time": "2023-08-21T16:05:27.977129",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "from definitions import BASE_PATH"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "26f640e0",
   "metadata": {
     "iopub.execute_input": "2023-08-21T16:05:29.116428Z",
     "iopub.status.busy": "2023-08-21T16:05:29.114006Z",
     "iopub.status.idle": "2023-08-21T16:05:29.126919Z",
     "shell.execute_reply": "2023-08-21T16:05:29.124758Z"
     "duration": 0.028512,
     "end_time": "2023-08-21T16:05:29.131713",
     "exception": false,
     "start_time": "2023-08-21T16:05:29.103201",
     "status": "completed"
    },
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# INPUT_PATH = BASE_PATH / \"tmp\" / \"3_aggregate_features\" / \"input\"\n",
    "# OUTPUT_PATH = BASE_PATH / \"tmp\" / \"3_aggregate_features\" / \"output\"\n",
    "#\n",
    "# INPUT_PATHS: dict[str, str] = {\n",
    "#     \"raw_features\": (INPUT_PATH / \"raw_features.csv\").__str__()\n",
    "# }\n",
    "#\n",
    "# OUTPUT_PATHS: dict[str, str] = {\n",
    "#     \"features\": (OUTPUT_PATH / \"features.csv\").__str__()\n",
    "# }\n",
    "INPUT_PATHS: dict[str, str] = {}\n",
    "OUTPUT_PATHS: dict[str, str] = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f1e624eb",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-21T16:05:29.153037Z",
     "iopub.status.busy": "2023-08-21T16:05:29.152027Z",
     "iopub.status.idle": "2023-08-21T16:05:29.161973Z",
     "shell.execute_reply": "2023-08-21T16:05:29.159663Z"
     "duration": 0.025025,
     "end_time": "2023-08-21T16:05:29.165408",
     "exception": false,
     "start_time": "2023-08-21T16:05:29.140383",
     "status": "completed"
    },
    "tags": [
     "injected-parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "INPUT_PATHS = {\n",
    "    \"raw_features\": \"/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/input/raw_features.csv\"\n",
    "OUTPUT_PATHS = {\n",
    "    \"aggregated_features\": \"/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/output/features.csv\"\n",
    "}\n"
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c5d9d980",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-21T16:05:29.183331Z",
     "iopub.status.busy": "2023-08-21T16:05:29.181976Z",
     "iopub.status.idle": "2023-08-21T16:05:47.896449Z",
     "shell.execute_reply": "2023-08-21T16:05:47.894434Z"
     "duration": 18.730379,
     "end_time": "2023-08-21T16:05:47.901647",
     "exception": false,
     "start_time": "2023-08-21T16:05:29.171268",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# inputs\n",
    "raw_features = pd.read_csv(INPUT_PATHS[\"raw_features\"], index_col=False)"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99f75f47",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-14T16:12:29.198485Z",
     "iopub.status.busy": "2023-08-14T16:12:29.197413Z",
     "iopub.status.idle": "2023-08-14T16:12:40.483527Z",
     "shell.execute_reply": "2023-08-14T16:12:40.482234Z"
    },
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": false,
     "start_time": "2023-08-21T16:05:47.909658",
     "status": "running"
   "source": [
    "meta_columns = [\"sample\", \"filename\", \"label\"]\n",
    "mfcc_aggregated = raw_features\\\n",
    "    .drop(meta_columns, axis=1, errors='ignore')\\\n",
    "    .groupby(raw_features.filename).agg(['min', 'max', 'mean', 'std', 'skew'])\n",
    "\n",
    "mfcc_meta = pd.DataFrame(raw_features['label'].groupby(raw_features.filename).last())\n",
    "mfcc_meta.columns = pd.MultiIndex.from_arrays([['label'], ['']])    # needed for merge\n",
    "mfcc_merged = pd.merge(mfcc_meta, mfcc_aggregated, left_index=True, right_index=True)\n",
    "\n",
    "# flatten the two-level column index into single-level names of the form feature_statistic\n",
    "one_level_cols = ['_'.join([str(el) for el in col]) for col in mfcc_merged.columns[1:]]\n",
    "one_level_cols.insert(0, \"label\")\n",
    "\n",
    "mfcc_merged.columns = pd.Index(one_level_cols)\n",
    "mfcc_merged = mfcc_merged.reset_index()\n",
    "mfcc_merged"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ac5c765",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-14T16:12:40.510391Z",
     "iopub.status.busy": "2023-08-14T16:12:40.509065Z",
     "iopub.status.idle": "2023-08-14T16:12:40.758881Z",
     "shell.execute_reply": "2023-08-14T16:12:40.757493Z"
    },
    "papermill": {
     "duration": null,
     "end_time": null,
     "exception": null,
     "start_time": null,
     "status": "pending"
   "outputs": [],
   "source": [
    "# outputs\n",
    "aggregated_features_path = Path(OUTPUT_PATHS[\"aggregated_features\"]).resolve()\n",
    "aggregated_features_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "output = mfcc_merged\n",
    "output.to_csv(aggregated_features_path, index=False)"
  }
 ],
 "metadata": {
  "celltoolbar": "Tags",
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "papermill": {
   "default_parameters": {},
   "duration": null,
   "end_time": null,
   "environment_variables": {},
   "exception": null,
   "input_path": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/notebooks/3_aggregate_features.ipynb",
   "output_path": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/notebooks/3_aggregate_features.ipynb",
   "parameters": {
    "INPUT_PATHS": {
     "raw_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/input/raw_features.csv"
    },
    "OUTPUT_PATHS": {
     "aggregated_features": "/home/lukas/Programming/uni/bachelorarbeit/dbrepo-ismir/tmp/3_aggregate_features/output/features.csv"
    }
   },
   "start_time": "2023-08-21T16:05:26.283764",
   "version": "2.4.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5