import pandas as pd
# Load the feature table from CSV into a DataFrame; it is summarised further down.
data = pd.read_csv("data/svm-hyperparameters-train-features.csv")
# |   | Pclass | Sex | Age    | SibSp | Parch | Fare   |
# |---|--------|-----|--------|-------|-------|--------|
# | 0 | 3      | 1   | 22.000 | 1     | 0     | 7.250  |
# | 1 | 1      | 0   | 38.000 | 1     | 0     | 71.283 |
# | 2 | 3      | 0   | 26.000 | 0     | 0     | 7.925  |
# | 3 | 1      | 0   | 35.000 | 1     | 0     | 53.100 |
# | 4 | 3      | 1   | 35.000 | 0     | 0     | 8.050  |
# Summary statistics for the loaded frame; include="all" asks pandas to report
# every column regardless of dtype rather than only the numeric ones.
data.describe(include="all")
# |       | Pclass  | Sex     | Age     | SibSp   | Parch   | Fare    |
# |-------|---------|---------|---------|---------|---------|---------|
# | count | 891.000 | 891.000 | 891.000 | 891.000 | 891.000 | 891.000 |
# | mean  | 2.309   | 0.648   | 29.759  | 0.523   | 0.382   | 32.204  |
# | std   | 0.836   | 0.478   | 13.003  | 1.103   | 0.806   | 49.693  |
# | min   | 1.000   | 0.000   | 0.420   | 0.000   | 0.000   | 0.000   |
# | 25%   | 2.000   | 0.000   | 22.000  | 0.000   | 0.000   | 7.910   |
# | 50%   | 3.000   | 1.000   | 30.000  | 0.000   | 0.000   | 14.454  |
# | 75%   | 3.000   | 1.000   | 35.000  | 1.000   | 0.000   | 31.000  |
# | max   | 3.000   | 1.000   | 80.000  | 8.000   | 6.000   | 512.329 |
from sdv.tabular import GaussianCopula
# Number of synthetic rows to draw in each experiment.
# NOTE(review): 1000 is taken from the row count shown in the recorded
# describe() output of the sampled frame — confirm against the notebook.
N_SAMPLES = 1000

# Baseline model: a Gaussian copula with default settings. It has to be
# constructed and fitted on the real data before sample() can be called;
# the export had dropped both steps, leaving `model` undefined here.
model = GaussianCopula()
model.fit(data)

# Draw N_SAMPLES synthetic rows from the fitted model.
new_df = model.sample(N_SAMPLES)
# |   | Pclass | Sex | Age    | SibSp | Parch | Fare   |
# |---|--------|-----|--------|-------|-------|--------|
# | 0 | 3      | 1   | 12.780 | 1     | 1     | 2.070  |
# | 1 | 2      | 1   | 44.930 | 0     | 0     | 95.806 |
# | 2 | 2      | 1   | 21.980 | 1     | 0     | 72.020 |
# | 3 | 2      | 0   | 30.120 | 2     | 1     | 84.968 |
# | 4 | 2      | 1   | 23.480 | 1     | 1     | 37.170 |
# |       | Pclass   | Sex      | Age      | SibSp    | Parch    | Fare     |
# |-------|----------|----------|----------|----------|----------|----------|
# | count | 1000.000 | 1000.000 | 1000.000 | 1000.000 | 1000.000 | 1000.000 |
# | mean  | 2.127    | 0.595    | 30.168   | 0.972    | 0.671    | 46.234   |
# | std   | 0.697    | 0.491    | 13.224   | 0.786    | 0.666    | 35.294   |
# | min   | 1.000    | 0.000    | 0.620    | 0.000    | 0.000    | 0.180    |
# | 25%   | 2.000    | 0.000    | 20.693   | 0.000    | 0.000    | 16.959   |
# | 50%   | 2.000    | 1.000    | 29.795   | 1.000    | 1.000    | 38.102   |
# | 75%   | 3.000    | 1.000    | 39.163   | 1.000    | 1.000    | 67.841   |
# | max   | 3.000    | 1.000    | 77.000   | 4.000    | 3.000    | 166.853  |

<ggplot: (338236528)>

<ggplot: (338395612)>

<ggplot: (338577035)>

<ggplot: (338383996)>
# Second model: state explicitly how each column should be transformed
# instead of relying on the defaults.
# NOTE(review): 'SibSp' is declared 'boolean', yet the describe() output of the
# real data shows values up to 8 — confirm this is intended ('integer' may
# have been meant).
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        'SibSp': 'boolean',
        'Parch': 'integer',
        'Fare': 'float',
    },
)

# A freshly constructed model carries no learned parameters; fit it on the
# real data before sampling (the export had dropped this step).
model.fit(data)

# Draw N_SAMPLES synthetic rows from the fitted model.
new_df = model.sample(N_SAMPLES)
# |   | Pclass | Sex | Age    | SibSp | Parch | Fare   |
# |---|--------|-----|--------|-------|-------|--------|
# | 0 | 3      | 0   | 24.330 | 0     | 1     | 24.759 |
# | 1 | 1      | 1   | 58.070 | 0     | 0     | 22.957 |
# | 2 | 2      | 0   | 23.700 | 0     | 0     | 15.298 |
# | 3 | 1      | 1   | 30.200 | 0     | 0     | 19.092 |
# | 4 | 1      | 1   | 51.370 | 0     | 0     | 99.106 |

<ggplot: (338395206)>

<ggplot: (338261950)>

<ggplot: (338558748)>

<ggplot: (338314674)>
count 891.000
mean 32.204
std 49.693
min 0.000
25% 7.910
50% 14.454
75% 31.000
max 512.329
Name: Fare, dtype: float64
# Ask the model which univariate distribution it reports for each column and
# echo the resulting mapping as the cell output.
distributions = model.get_distributions()
distributions
{'Pclass.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'Sex.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'Age.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'SibSp.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'Parch.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'Fare.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian'}
# Third model: same column transformers as before, plus an explicit marginal
# distribution for 'Fare'.
# NOTE(review): get_distributions() above already reported TruncatedGaussian
# for 'Fare', so this override looks like a no-op — confirm whether a different
# distribution was intended.
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        'SibSp': 'boolean',
        'Parch': 'integer',
        'Fare': 'float',
    },
    field_distributions={
        'Fare': 'truncated_gaussian',
    },
)

# The new model must be fitted on the real data before it can be sampled
# (the export had dropped this step).
model.fit(data)

# Draw N_SAMPLES synthetic rows and summarise the synthetic 'Fare' column so
# it can be compared with the real one.
new_df = model.sample(N_SAMPLES)
new_df.Fare.describe()
count 1000.000
mean 47.928
std 36.876
min 0.044
25% 18.738
50% 40.028
75% 69.421
max 195.331
Name: Fare, dtype: float64

<ggplot: (338496543)>