import pandas as pd
# Load the training feature table from CSV into a DataFrame.
data = pd.read_csv("data/svm-hyperparameters-train-features.csv")
| Pclass | Sex | Age | SibSp | Parch | Fare |
---|
0 | 3 | 1 | 22.000 | 1 | 0 | 7.250 |
---|
1 | 1 | 0 | 38.000 | 1 | 0 | 71.283 |
---|
2 | 3 | 0 | 26.000 | 0 | 0 | 7.925 |
---|
3 | 1 | 0 | 35.000 | 1 | 0 | 53.100 |
---|
4 | 3 | 1 | 35.000 | 0 | 0 | 8.050 |
---|
# Summary statistics for every column of the real data; include="all" asks
# pandas to describe all columns regardless of dtype.
data.describe(include="all")
| Pclass | Sex | Age | SibSp | Parch | Fare |
---|
count | 891.000 | 891.000 | 891.000 | 891.000 | 891.000 | 891.000 |
---|
mean | 2.309 | 0.648 | 29.759 | 0.523 | 0.382 | 32.204 |
---|
std | 0.836 | 0.478 | 13.003 | 1.103 | 0.806 | 49.693 |
---|
min | 1.000 | 0.000 | 0.420 | 0.000 | 0.000 | 0.000 |
---|
25% | 2.000 | 0.000 | 22.000 | 0.000 | 0.000 | 7.910 |
---|
50% | 3.000 | 1.000 | 30.000 | 0.000 | 0.000 | 14.454 |
---|
75% | 3.000 | 1.000 | 35.000 | 1.000 | 0.000 | 31.000 |
---|
max | 3.000 | 1.000 | 80.000 | 8.000 | 6.000 | 512.329 |
---|
from sdv.tabular import GaussianCopula

# Number of synthetic rows drawn per sample; 1000 matches the row count
# reported in the summary of the synthetic data.
N_SAMPLES = 1000

# Baseline model with default settings. The model has to be constructed and
# fitted on the real data before it can be sampled from.
model = GaussianCopula()
model.fit(data)

# Draw the synthetic dataset from the fitted model.
new_df = model.sample(N_SAMPLES)
| Pclass | Sex | Age | SibSp | Parch | Fare |
---|
0 | 3 | 1 | 12.780 | 1 | 1 | 2.070 |
---|
1 | 2 | 1 | 44.930 | 0 | 0 | 95.806 |
---|
2 | 2 | 1 | 21.980 | 1 | 0 | 72.020 |
---|
3 | 2 | 0 | 30.120 | 2 | 1 | 84.968 |
---|
4 | 2 | 1 | 23.480 | 1 | 1 | 37.170 |
---|
| Pclass | Sex | Age | SibSp | Parch | Fare |
---|
count | 1000.000 | 1000.000 | 1000.000 | 1000.000 | 1000.000 | 1000.000 |
---|
mean | 2.127 | 0.595 | 30.168 | 0.972 | 0.671 | 46.234 |
---|
std | 0.697 | 0.491 | 13.224 | 0.786 | 0.666 | 35.294 |
---|
min | 1.000 | 0.000 | 0.620 | 0.000 | 0.000 | 0.180 |
---|
25% | 2.000 | 0.000 | 20.693 | 0.000 | 0.000 | 16.959 |
---|
50% | 2.000 | 1.000 | 29.795 | 1.000 | 1.000 | 38.102 |
---|
75% | 3.000 | 1.000 | 39.163 | 1.000 | 1.000 | 67.841 |
---|
max | 3.000 | 1.000 | 77.000 | 4.000 | 3.000 | 166.853 |
---|
<ggplot: (338236528)>
<ggplot: (338395612)>
<ggplot: (338577035)>
<ggplot: (338383996)>
# Customised model: state explicitly how each column is transformed before
# the copula is fitted, instead of relying on the inferred defaults.
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        # SibSp is a count (the real data ranges from 0 to 8), so it is
        # treated as an integer like Parch. Declaring it 'boolean'
        # collapses the column to 0/1 in the synthetic output.
        'SibSp': 'integer',
        'Parch': 'integer',
        'Fare': 'float',
    }
)

# A newly constructed model holds no learned parameters; fit it on the real
# data before sampling.
model.fit(data)
new_df = model.sample(N_SAMPLES)
| Pclass | Sex | Age | SibSp | Parch | Fare |
---|
0 | 3 | 0 | 24.330 | 0 | 1 | 24.759 |
---|
1 | 1 | 1 | 58.070 | 0 | 0 | 22.957 |
---|
2 | 2 | 0 | 23.700 | 0 | 0 | 15.298 |
---|
3 | 1 | 1 | 30.200 | 0 | 0 | 19.092 |
---|
4 | 1 | 1 | 51.370 | 0 | 0 | 99.106 |
---|
<ggplot: (338395206)>
<ggplot: (338261950)>
<ggplot: (338558748)>
<ggplot: (338314674)>
count 891.000
mean 32.204
std 49.693
min 0.000
25% 7.910
50% 14.454
75% 31.000
max 512.329
Name: Fare, dtype: float64
# Ask the fitted model which univariate distribution it uses for each column.
distributions = model.get_distributions()
# Bare expression: presumably left as the last line of a notebook cell so the
# mapping is displayed.
distributions
{'Pclass.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'Sex.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'Age.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'SibSp.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'Parch.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
'Fare.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian'}
# Same per-column transformers as before, plus an explicit marginal
# distribution for Fare.
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        # SibSp is a count (the real data ranges from 0 to 8), so it is
        # treated as an integer like Parch rather than as a boolean.
        'SibSp': 'integer',
        'Parch': 'integer',
        'Fare': 'float',
    },
    field_distributions={
        # NOTE(review): the get_distributions() output shown earlier already
        # reports TruncatedGaussian for Fare, so this override may be a
        # no-op -- confirm which distribution was intended here.
        'Fare': 'truncated_gaussian',
    }
)

# A newly constructed model holds no learned parameters; fit it on the real
# data before sampling.
model.fit(data)
new_df = model.sample(N_SAMPLES)

# Compare the synthetic Fare summary against the real Fare summary.
new_df.Fare.describe()
count 1000.000
mean 47.928
std 36.876
min 0.044
25% 18.738
50% 40.028
75% 69.421
max 195.331
Name: Fare, dtype: float64
<ggplot: (338496543)>