Commit cca0801c · authored 2 years ago by Mahler, Lukas
refactor SVC creation
parent b016d487
No related branches or tags contain this commit.
1 merge request: !1 Dev
Showing 1 changed file: src/dbrepo-ismir/ml_analysis.ipynb (+32 additions, −15 deletions)
@@ -62,7 +62,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"train: pd.DataFrame = mfcc_merged.sample(frac = 0.8)\n",
+"train: pd.DataFrame = mfcc_merged.sample(frac = 0.8).sort_index()\n",
 "\n",
 "train"
 ]
@@ -98,7 +98,9 @@
 "    index2classname[i] = classname\n",
 "\n",
 "# map genre to genre index \n",
-"y = np.array([classname2index[classname] for classname in train_metadata.genre.values])"
+"y = np.array([classname2index[classname] for classname in train.genre.values])\n",
+"\n",
+"(X, y)"
 ]
 },
 {
@@ -107,11 +109,16 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"X_test: pd.DataFrame = test\n",
+"X_test = test.drop(meta_columns, level=0, axis=1, errors='ignore')\n",
 "\n",
 "print(X.shape)\n",
 "print(X_test.shape)\n",
-"print(X_test.shape[0] / X.shape[0])"
+"print(X_test.shape[0] / X.shape[0])\n",
+"X_test\n",
+"\n",
+"y_test = np.array([classname2index[classname] for classname in test.genre.values])\n",
+"\n",
+"y_test"
 ]
 },
 {
@@ -124,7 +131,9 @@
 "\n",
 "scaler = StandardScaler()\n",
 "X_standardized = scaler.fit_transform(X.values)\n",
-"X_test_standardized = scaler.transform(X_test.values)"
+"X_test_standardized = scaler.transform(X_test.values)\n",
+"\n",
+"X_standardized"
 ]
 },
 {
@@ -138,7 +147,11 @@
 "X_pca = pca.transform(X_standardized)\n",
 "X_test_pca = pca.transform(X_test_standardized)\n",
 "\n",
-"print(sum(pca.explained_variance_ratio_))"
+"print(sum(pca.explained_variance_ratio_))\n",
+"\n",
+"print(X_pca.shape)\n",
+"print(X_test_pca.shape)\n",
+"print(y.shape)"
 ]
 },
 {
@@ -149,13 +162,15 @@
 "source": [
 "# Fit SVM:\n",
 "\n",
 "# TODO: ok for multiple rows per file?\n",
 "X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size = 0.2, random_state=4, shuffle = True)\n",
 "\n",
-"clf = SVC(kernel = 'rbf', probability = True)\n",
+"clf = SVC(kernel='rbf', probability=True)\n",
 "clf.fit(X_train, y_train)\n",
 "\n",
-"print(accuracy_score(clf.predict(X_val), y_val))"
+"print(accuracy_score(clf.predict(X_val), y_val))\n",
+"\n",
+"print(X_val)\n",
+"print(y_val)"
 ]
 },
 {
@@ -165,17 +180,18 @@
 "outputs": [],
 "source": [
 "# grid for C, gamma\n",
-"C_grid = [0.001, 0.01, 0.1, 1, 10]\n",
+"C_grid = [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n",
 "gamma_grid = [0.001, 0.01, 0.1, 1, 10]\n",
 "param_grid = {'C': C_grid, 'gamma': gamma_grid}\n",
 "\n",
-"grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=3, scoring=\"accuracy\")\n",
+"grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, scoring=\"accuracy\")\n",
 "grid.fit(X_train, y_train)\n",
 "\n",
 "# Find the best model\n",
 "print(grid.best_score_)\n",
 "print(grid.best_params_)\n",
-"print(grid.best_estimator_)"
+"print(grid.best_estimator_)\n",
+"print(accuracy_score(grid.predict(X_val), y_val))"
 ]
 },
 {
@@ -186,7 +202,7 @@
 "source": [
 "# Optimal model\n",
 "\n",
-"clf = SVC(kernel = 'rbf', C=4, gamma=0.01, probability=True)\n",
+"clf = SVC(kernel='rbf', C=4, gamma=0.01, probability=True)\n",
 "clf.fit(X_train, y_train)\n",
 "\n",
 "print(accuracy_score(clf.predict(X_val), y_val))"
@@ -201,8 +217,9 @@
 "# Fit entire training sets\n",
 "clf.fit(X_pca, y)\n",
 "\n",
-"clf.predict_proba(X_test_pca)\n",
-"\n"
+"print(accuracy_score(clf.predict(X_test_pca), y_test))\n",
+"\n",
+"clf.predict_proba(X_test_pca)"
 ]
 },
 {
%% Cell type:code id: tags:
```python
# Base: https://www.kaggle.com/code/anmour/svm-using-mfcc-features/notebook
import pandas as pd
import numpy as np
import os
import librosa
import scipy
from scipy.stats import skew
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from pathlib import Path

BASE_PATH = Path("").absolute().parent.parent

meta_columns = ["filename", "sample", "genre", "filenumber"]
# 22 / 73
```
%% Cell type:code id: tags:
```python
mfcc_raw: pd.DataFrame = pd.read_pickle(BASE_PATH / "out" / "mfcc" / "mfcc.pickle").set_index('filename')

# Aggregate from n rows per file to 1 (calculate min, max, etc. for each feature)
mfcc_aggregated = mfcc_raw.drop(meta_columns, axis=1, errors='ignore').groupby(mfcc_raw.index).agg(['min', 'max', 'mean', 'std', 'skew'])

mfcc_meta = pd.DataFrame(mfcc_raw['genre'].groupby(mfcc_raw.index).last())
mfcc_meta.columns = pd.MultiIndex.from_arrays([['genre'], ['']])

mfcc_merged = pd.merge(mfcc_meta, mfcc_aggregated, left_index=True, right_index=True)

# print(mfcc_aggregated)
# print(mfcc_aggregated_meta)
mfcc_merged
```
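For reference, a minimal sketch of what the aggregation above produces: one row per file and a two-level (feature, statistic) column index. The toy frame here is hypothetical and only stands in for `mfcc_raw`.

```python
import pandas as pd

# Hypothetical stand-in for mfcc_raw: two files, three frames each, one MFCC column.
toy = pd.DataFrame(
    {"mfcc_0": [1.0, 3.0, 2.0, 4.0, 6.0, 5.0]},
    index=["a.wav", "a.wav", "a.wav", "b.wav", "b.wav", "b.wav"],
)

# Same pattern as above: collapse the n rows per file into one row,
# with columns becoming a MultiIndex of (feature, statistic).
agg = toy.groupby(toy.index).agg(['min', 'max', 'mean', 'std', 'skew'])
print(agg.columns.tolist())  # [('mfcc_0', 'min'), ('mfcc_0', 'max'), ...]
print(agg)
```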
%% Cell type:code id: tags:
```python
train: pd.DataFrame = mfcc_merged.sample(frac = 0.8).sort_index()

train
```
%% Cell type:code id: tags:
```python
test: pd.DataFrame = mfcc_merged.drop(train.index)

test
```
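As a quick sanity check on the sample/drop split above (an illustrative addition, not part of the commit), the two frames should be disjoint and together cover `mfcc_merged`; passing `random_state` to `sample` would additionally make the 80/20 split reproducible across runs.

```python
# Disjoint: no file ends up in both splits; exhaustive: together they cover mfcc_merged.
assert train.index.intersection(test.index).empty
assert len(train) + len(test) == len(mfcc_merged)

# Reproducible variant of the split (the random_state value is an assumption, not in the notebook):
# train = mfcc_merged.sample(frac = 0.8, random_state=42).sort_index()
# test = mfcc_merged.drop(train.index)
```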
%% Cell type:code id: tags:
```python
# remove labels
X = train.drop(meta_columns, level=0, axis=1, errors='ignore')
columns: pd.Index = X.columns

classnames = np.sort(np.unique(mfcc_merged.genre.values))
# => ["classical", "electric", "pop", "rock"]

# map classname to an index and create dicts for easy lookup in O(1)
classname2index = {}
index2classname = {}

for i, classname in enumerate(classnames):
    classname2index[classname] = i
    index2classname[i] = classname

# map genre to genre index
y = np.array([classname2index[classname] for classname in train.genre.values])

(X, y)
```
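The two dictionaries above implement a plain integer encoding of the genre labels. An equivalent alternative (not what the notebook uses) is scikit-learn's `LabelEncoder`, which also sorts its classes and therefore yields the same codes.

```python
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(classnames)          # le.classes_ matches the sorted classnames array
print(le.transform(["classical", "rock"]))   # integer codes, like classname2index
print(le.inverse_transform([0, 1]))          # genre names, like index2classname
```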
%% Cell type:code id: tags:
```python
X_test = test.drop(meta_columns, level=0, axis=1, errors='ignore')

print(X.shape)
print(X_test.shape)
print(X_test.shape[0] / X.shape[0])
X_test

y_test = np.array([classname2index[classname] for classname in test.genre.values])

y_test
```
%% Cell type:code id: tags:
```python
# Standardize for PCA
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X.values)
X_test_standardized = scaler.transform(X_test.values)

X_standardized
```
%% Cell type:code id: tags:
```python
# Reduce Dimensions via PCA
pca = PCA(n_components=50).fit(X_standardized)
X_pca = pca.transform(X_standardized)
X_test_pca = pca.transform(X_test_standardized)

print(sum(pca.explained_variance_ratio_))

print(X_pca.shape)
print(X_test_pca.shape)
print(y.shape)
```
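`n_components=50` above is a hand-picked value. Below is a small sketch of deriving the component count from the cumulative explained variance instead; the 95% threshold is an assumption, not something the notebook specifies.

```python
import numpy as np
from sklearn.decomposition import PCA

# Fit with all components, then keep the smallest number explaining >= 95% of the variance.
pca_full = PCA().fit(X_standardized)
cumulative = np.cumsum(pca_full.explained_variance_ratio_)
n_components = int(np.searchsorted(cumulative, 0.95)) + 1
print(n_components, cumulative[n_components - 1])

# PCA(n_components=0.95) performs the same selection in one step.
```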
%% Cell type:code id: tags:
```python
# Fit SVM:

# TODO: ok for multiple rows per file?
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size = 0.2, random_state=4, shuffle = True)

clf = SVC(kernel='rbf', probability=True)
clf.fit(X_train, y_train)

print(accuracy_score(clf.predict(X_val), y_val))

print(X_val)
print(y_val)
```
%% Cell type:code id: tags:
```python
# grid for C, gamma
C_grid = [0.001, 0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
gamma_grid = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'C': C_grid, 'gamma': gamma_grid}

grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, scoring="accuracy")
grid.fit(X_train, y_train)

# Find the best model
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
print(accuracy_score(grid.predict(X_val), y_val))
```
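The next cell hard-codes `C=4` and `gamma=0.01`. Assuming those are the values the grid search settled on, the result can also be wired in directly instead of retyping it — a sketch:

```python
# Reuse whatever parameters the grid search selected.
best_clf = SVC(kernel='rbf', probability=True, **grid.best_params_)
best_clf.fit(X_train, y_train)
print(accuracy_score(best_clf.predict(X_val), y_val))
```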
%% Cell type:code id: tags:
```python
# Optimal model
clf = SVC(kernel='rbf', C=4, gamma=0.01, probability=True)
clf.fit(X_train, y_train)

print(accuracy_score(clf.predict(X_val), y_val))
```
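Beyond the single accuracy number printed above, a per-genre breakdown on the validation split can show which classes get confused. This is an illustrative addition (not part of the commit) that reuses `index2classname` from above.

```python
from sklearn.metrics import classification_report, confusion_matrix

y_val_pred = clf.predict(X_val)
labels = sorted(index2classname)
genre_names = [index2classname[i] for i in labels]

print(confusion_matrix(y_val, y_val_pred, labels=labels))
print(classification_report(y_val, y_val_pred, labels=labels, target_names=genre_names))
```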
%% Cell type:code id: tags:
```python
# Fit entire training sets
clf.fit(X_pca, y)

print(accuracy_score(clf.predict(X_test_pca), y_test))

clf.predict_proba(X_test_pca)
```
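The commented-out cell below sketches writing top-k labels to a submission file. In the same spirit, here is a compact way to turn the `predict_proba` matrix above into ranked genre names per test file (illustrative; `top_k = 3` is an assumption):

```python
import numpy as np

proba = clf.predict_proba(X_test_pca)

# Columns of proba follow clf.classes_; argsort descending ranks the genres per row.
top_k = 3
ranked = np.argsort(proba, axis=1)[:, ::-1][:, :top_k]
top_genres = [[index2classname[clf.classes_[j]] for j in row] for row in ranked]
print(top_genres[:5])
```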
%% Cell type:code id: tags:
```python
# Fit the entire training sets
# def convert_to_labels(preds, i2c, k=3):
# ans = []
# ids = []
# for p in preds:
# idx = np.argsort(p)[::-1]
# ids.append([i for i in idx[:k]])
# ans.append(' '.join([i2c[i] for i in idx[:k]]))
# return ans, ids
# clf.fit(X_pca, y)
# str_preds, _ = convert_to_labels(clf.predict_proba(X_test_pca), i2c, k=3)
# # Write to outputs
# subm = pd.DataFrame()
# subm['fname'] = audio_test_files
# subm['label'] = str_preds
# subm.to_csv('submission.csv', index=False)
```