*** Wartungsfenster jeden ersten Mittwoch vormittag im Monat ***

Skip to content
Snippets Groups Projects
Commit cca0801c authored by Mahler, Lukas
Browse files

refactor SVC creation

parent b016d487
No related branches found
No related tags found
1 merge request: !1 Dev
%% Cell type:code id: tags:
``` python
# Base: https://www.kaggle.com/code/anmour/svm-using-mfcc-features/notebook
import pandas as pd
import numpy as np
import os
import librosa
import scipy
from scipy.stats import skew
from tqdm import tqdm, tqdm_pandas
tqdm.pandas()
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from pathlib import Path
# Root directory used to locate data files: two levels above the current
# working directory (presumably the notebook lives in a nested folder of the
# project -- confirm against the repository layout).
BASE_PATH = Path.cwd().parent.parent
# Names of the non-feature (metadata) columns; they are dropped before the
# feature columns are aggregated or passed to the model.
meta_columns = [
    "filename",
    "sample",
    "genre",
    "filenumber",
]
# 22 / 73
```
%% Cell type:code id: tags:
``` python
# Load the pickled MFCC table and index it by file name (several rows may
# share one file name).
# NOTE(review): read_pickle must only be used on trusted files.
mfcc_pickle_path = BASE_PATH / "out" / "mfcc" / "mfcc.pickle"
mfcc_raw: pd.DataFrame = pd.read_pickle(mfcc_pickle_path).set_index('filename')

# Collapse the n rows of each file into a single row holding summary
# statistics (min, max, mean, std, skew) of every feature column.
feature_rows = mfcc_raw.drop(columns=meta_columns, errors='ignore')
mfcc_aggregated = feature_rows.groupby(mfcc_raw.index).agg(['min', 'max', 'mean', 'std', 'skew'])

# One genre label per file: the last value seen for that file name.
mfcc_meta = mfcc_raw['genre'].groupby(mfcc_raw.index).last().to_frame()
# Give the label frame two column levels so it lines up with the two-level
# (feature, statistic) columns produced by the aggregation.
mfcc_meta.columns = pd.MultiIndex.from_arrays([['genre'], ['']])

# Join labels and aggregated features on the shared file-name index.
mfcc_merged = mfcc_meta.merge(mfcc_aggregated, left_index=True, right_index=True)
mfcc_merged
```
%% Cell type:code id: tags:
``` python
# Draw a random 80% of the files as the training set and sort it by file name.
# NOTE(review): no random_state is passed, so the split differs on every run;
# pass one to `sample` if a reproducible split is required.
train: pd.DataFrame = mfcc_merged.sample(frac=0.8).sort_index()
train
```
%% Cell type:code id: tags:
``` python
# The test set is every file that was not sampled into the training set.
test: pd.DataFrame = mfcc_merged.drop(index=train.index)
test
```
%% Cell type:code id: tags:
``` python
# Training features: drop the metadata/label columns (matched on the first
# column level), keeping only the aggregated feature columns.
X = train.drop(meta_columns, level=0, axis=1, errors='ignore')
columns: pd.Index = X.columns

# Distinct genre names over the whole data set; np.unique already returns
# them sorted, e.g. ["classical", "electric", "pop", "rock"].
classnames = np.unique(mfcc_merged.genre.values)

# Two-way lookup tables between a class name and its integer index (O(1)).
classname2index = {classname: i for i, classname in enumerate(classnames)}
index2classname = dict(enumerate(classnames))

# Encode the genre of every training row as its class index.
y = np.array([classname2index[classname] for classname in train.genre.values])
(X, y)
```
%% Cell type:code id: tags:
``` python
# Test features: same column selection as for the training features
# (metadata/label columns dropped on the first column level).
X_test: pd.DataFrame = test.drop(meta_columns, level=0, axis=1, errors='ignore')
# Sanity check: shapes of both feature tables and the test/train size ratio.
print(X.shape)
print(X_test.shape)
print(X_test.shape[0] / X.shape[0])
X_test
# Encode the genre of every test row with the mapping built from all classes.
y_test = np.array([classname2index[classname] for classname in test.genre.values])
y_test
```
%% Cell type:code id: tags:
``` python
# Standardize the features before PCA. The scaler is fitted on the training
# features only and then applied unchanged to the test features.
train_matrix = X.values
scaler = StandardScaler().fit(train_matrix)
X_standardized = scaler.transform(train_matrix)
X_test_standardized = scaler.transform(X_test.values)
X_standardized
```
%% Cell type:code id: tags:
``` python
# Dimensionality reduction: fit a 50-component PCA on the standardized
# training features, then project both training and test features with it.
pca = PCA(n_components=50)
pca.fit(X_standardized)
X_pca, X_test_pca = (
    pca.transform(matrix) for matrix in (X_standardized, X_test_standardized)
)
# Sum of the explained-variance ratios of the kept components.
print(sum(pca.explained_variance_ratio_))
# Shapes of the projected train/test matrices and of the label vector.
print(X_pca.shape, X_test_pca.shape, y.shape, sep="\n")
```
%% Cell type:code id: tags:
``` python
# Fit SVM:
# TODO: ok for multiple rows per file?
# Hold out 20% of the PCA-reduced training data for validation; the fixed
# random_state makes this particular split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=4, shuffle=True)
# Baseline RBF-kernel SVM with default C and gamma; probability=True is
# needed so that predict_proba can be called on the fitted model.
clf = SVC(kernel='rbf', probability=True)
clf.fit(X_train, y_train)
# Validation accuracy; accuracy_score takes (y_true, y_pred) in that order.
print(accuracy_score(y_val, clf.predict(X_val)))
print(X_val)
print(y_val)
```
%% Cell type:code id: tags:
``` python
# Hyper-parameter grid for the RBF SVM: candidate values for C and gamma.
C_grid = [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
gamma_grid = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'C': C_grid, 'gamma': gamma_grid}
# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training split.
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)
# Find the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
# Accuracy of the best estimator on the held-out validation split;
# accuracy_score takes (y_true, y_pred) in that order.
print(accuracy_score(y_val, grid.predict(X_val)))
```
%% Cell type:code id: tags:
``` python
# Optimal model: RBF SVM with fixed C and gamma (presumably the values picked
# from the grid search -- confirm against its printed best_params_).
clf = SVC(kernel='rbf', C=4, gamma=0.01, probability=True)
clf.fit(X_train, y_train)
# Validation accuracy; accuracy_score takes (y_true, y_pred) in that order.
print(accuracy_score(y_val, clf.predict(X_val)))
```
%% Cell type:code id: tags:
``` python
# Refit the classifier on the entire training set (train + validation split).
clf.fit(X_pca, y)
# Accuracy on the held-out test set; accuracy_score takes (y_true, y_pred).
print(accuracy_score(y_test, clf.predict(X_test_pca)))
# Per-class probabilities for every test sample (shown as the cell output).
clf.predict_proba(X_test_pca)
```
%% Cell type:code id: tags:
``` python
# Disabled (commented out): refit on the entire training set and write the top-k predicted labels to submission.csv
# def convert_to_labels(preds, i2c, k=3):
# ans = []
# ids = []
# for p in preds:
# idx = np.argsort(p)[::-1]
# ids.append([i for i in idx[:k]])
# ans.append(' '.join([i2c[i] for i in idx[:k]]))
# return ans, ids
# clf.fit(X_pca, y)
# str_preds, _ = convert_to_labels(clf.predict_proba(X_test_pca), i2c, k=3)
# # Write to outputs
# subm = pd.DataFrame()
# subm['fname'] = audio_test_files
# subm['label'] = str_preds
# subm.to_csv('submission.csv', index=False)
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment