import warnings

import pandas as pd

# Silence all warnings for the rest of the session to keep the output clean.
warnings.filterwarnings('ignore')

# Location of the training features CSV, relative to the working directory.
TRAIN_FEATURES_CSV = "data/svm-hyperparameters-train-features.csv"

# Load the real data and preview its first rows.
data = pd.read_csv(TRAIN_FEATURES_CSV)
data.head()
|   | Pclass | Sex | Age  | SibSp | Parch | Fare    |
|---|--------|-----|------|-------|-------|---------|
| 0 | 3      | 1   | 22.0 | 1     | 0     | 7.2500  |
| 1 | 1      | 0   | 38.0 | 1     | 0     | 71.2833 |
| 2 | 3      | 0   | 26.0 | 0     | 0     | 7.9250  |
| 3 | 1      | 0   | 35.0 | 1     | 0     | 53.1000 |
| 4 | 3      | 1   | 35.0 | 0     | 0     | 8.0500  |
# Summary statistics for the real data; include='all' asks pandas to describe
# every column rather than only the numeric ones.
data.describe(include='all')
|       | Pclass     | Sex        | Age        | SibSp      | Parch      | Fare       |
|-------|------------|------------|------------|------------|------------|------------|
| count | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean  | 2.308642   | 0.647587   | 29.758889  | 0.523008   | 0.381594   | 32.204208  |
| std   | 0.836071   | 0.477990   | 13.002570  | 1.102743   | 0.806057   | 49.693429  |
| min   | 1.000000   | 0.000000   | 0.420000   | 0.000000   | 0.000000   | 0.000000   |
| 25%   | 2.000000   | 0.000000   | 22.000000  | 0.000000   | 0.000000   | 7.910400   |
| 50%   | 3.000000   | 1.000000   | 30.000000  | 0.000000   | 0.000000   | 14.454200  |
| 75%   | 3.000000   | 1.000000   | 35.000000  | 1.000000   | 0.000000   | 31.000000  |
| max   | 3.000000   | 1.000000   | 80.000000  | 8.000000   | 6.000000   | 512.329200 |
from sdv.tabular import CopulaGAN

# Build a CopulaGAN with default settings and fit it on the real data.
# Without these two steps `model` is undefined here and sampling fails.
model = CopulaGAN()
model.fit(data)

# Draw 200 synthetic rows and preview the first few of them.
new_data = model.sample(200)
new_data.head()
|   | Pclass | Sex | Age   | SibSp | Parch | Fare     |
|---|--------|-----|-------|-------|-------|----------|
| 0 | 3      | 0   | 32.02 | 0     | 0     | 18.2784  |
| 1 | 3      | 1   | 20.90 | 5     | 0     | 109.6910 |
| 2 | 3      | 0   | 40.80 | 0     | 1     | 179.5139 |
| 3 | 3      | 0   | 33.13 | 1     | 0     | 17.3447  |
| 4 | 1      | 1   | 20.62 | 0     | 0     | 9.6040   |
# Summary statistics of the synthetic sample, to compare column by column
# with the describe() output of the real data.
new_data.describe(include='all')
|       | Pclass     | Sex        | Age        | SibSp      | Parch      | Fare       |
|-------|------------|------------|------------|------------|------------|------------|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean  | 2.490000   | 0.450000   | 31.404700  | 0.405000   | 0.115000   | 46.756109  |
| std   | 0.820528   | 0.498742   | 12.297721  | 0.919348   | 0.461345   | 52.828023  |
| min   | 1.000000   | 0.000000   | 0.450000   | 0.000000   | 0.000000   | 3.034900   |
| 25%   | 2.000000   | 0.000000   | 24.712500  | 0.000000   | 0.000000   | 11.768475  |
| 50%   | 3.000000   | 0.000000   | 30.295000  | 0.000000   | 0.000000   | 27.859150  |
| 75%   | 3.000000   | 1.000000   | 34.307500  | 1.000000   | 0.000000   | 58.361350  |
| max   | 3.000000   | 1.000000   | 73.340000  | 7.000000   | 3.000000   | 381.859600 |
from sdv.evaluation import evaluate
# Score the synthetic sample against the real data. The call is left as a bare
# expression so the notebook displays the resulting score (a single float).
# NOTE(review): presumably higher means more similar - confirm in the SDV docs.
evaluate(new_data, data)
0.539229052965023
# Rebuild the model with an explicit transformer per column and a custom
# marginal distribution for Fare.
model = CopulaGAN(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        # NOTE(review): SibSp ranges from 0 to 8 in the real data, but with
        # 'boolean' the sampled values only reach 1; 'integer' (as used for
        # Parch) may be what was intended - confirm before changing, since the
        # recorded outputs were produced with 'boolean'.
        'SibSp': 'boolean',
        'Parch': 'integer',
        'Fare': 'float',
    },
    field_distributions={
        'Fare': 'truncated_gaussian',
    },
)

# The freshly constructed model has not seen any data yet; fit it on the real
# data before sampling from it.
model.fit(data)

# Draw 200 synthetic rows from the reconfigured model and preview them.
new_data = model.sample(200)
new_data.head()
|   | Pclass | Sex | Age   | SibSp | Parch | Fare    |
|---|--------|-----|-------|-------|-------|---------|
| 0 | 1      | 0   | 1.07  | 1     | 0     | 13.8318 |
| 1 | 3      | 0   | 15.58 | 0     | 0     | 46.6937 |
| 2 | 1      | 1   | 26.53 | 0     | 0     | 53.8841 |
| 3 | 1      | 0   | 29.58 | 0     | 0     | 5.2646  |
| 4 | 1      | 0   | 30.13 | 0     | 0     | 5.1415  |
# Summary statistics of the sample drawn from the reconfigured model, to
# compare column by column with the describe() output of the real data.
new_data.describe(include='all')
|       | Pclass     | Sex        | Age        | SibSp      | Parch      | Fare       |
|-------|------------|------------|------------|------------|------------|------------|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean  | 2.210000   | 0.410000   | 22.935600  | 0.550000   | 0.145000   | 49.155439  |
| std   | 0.932873   | 0.493068   | 13.955182  | 0.498742   | 0.441531   | 42.733997  |
| min   | 1.000000   | 0.000000   | 0.430000   | 0.000000   | 0.000000   | 4.749400   |
| 25%   | 1.000000   | 0.000000   | 13.427500  | 0.000000   | 0.000000   | 13.340950  |
| 50%   | 3.000000   | 0.000000   | 25.875000  | 1.000000   | 0.000000   | 34.373700  |
| 75%   | 3.000000   | 1.000000   | 30.225000  | 1.000000   | 0.000000   | 75.206275  |
| max   | 3.000000   | 1.000000   | 71.340000  | 1.000000   | 3.000000   | 220.761200 |
evaluate(new_data, data)
0.4983713994365315