import pandas as pd
import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

# Read the training feature table from disk and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer="data/svm-hyperparameters-train-features.csv",
)
data.head()
|   | Pclass | Sex | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|
| 0 | 3 | 1 | 22.0 | 1 | 0 | 7.2500 |
| 1 | 1 | 0 | 38.0 | 1 | 0 | 71.2833 |
| 2 | 3 | 0 | 26.0 | 0 | 0 | 7.9250 |
| 3 | 1 | 0 | 35.0 | 1 | 0 | 53.1000 |
| 4 | 3 | 1 | 35.0 | 0 | 0 | 8.0500 |
# Summary statistics of the real table; include='all' covers every column
# regardless of dtype.
data.describe(include='all')
|   | Pclass | Sex | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 2.308642 | 0.647587 | 29.758889 | 0.523008 | 0.381594 | 32.204208 |
| std | 0.836071 | 0.477990 | 13.002570 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 2.000000 | 0.000000 | 22.000000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 3.000000 | 1.000000 | 30.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 3.000000 | 1.000000 | 35.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 3.000000 | 1.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
from sdv.tabular import CopulaGAN

# Baseline: a CopulaGAN with default settings.  The model must be created and
# fitted on the real table before it can be sampled; previously `model` was
# used here before any assignment, which raises NameError.
model = CopulaGAN()
model.fit(data)

# Draw 200 synthetic rows and preview the first few.
new_data = model.sample(200)
new_data.head()
|   | Pclass | Sex | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|
| 0 | 3 | 0 | 32.02 | 0 | 0 | 18.2784 |
| 1 | 3 | 1 | 20.90 | 5 | 0 | 109.6910 |
| 2 | 3 | 0 | 40.80 | 0 | 1 | 179.5139 |
| 3 | 3 | 0 | 33.13 | 1 | 0 | 17.3447 |
| 4 | 1 | 1 | 20.62 | 0 | 0 | 9.6040 |
# Summary statistics of the synthetic sample, for side-by-side comparison with
# the statistics of the real table.
new_data.describe(include='all')
|   | Pclass | Sex | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 2.490000 | 0.450000 | 31.404700 | 0.405000 | 0.115000 | 46.756109 |
| std | 0.820528 | 0.498742 | 12.297721 | 0.919348 | 0.461345 | 52.828023 |
| min | 1.000000 | 0.000000 | 0.450000 | 0.000000 | 0.000000 | 3.034900 |
| 25% | 2.000000 | 0.000000 | 24.712500 | 0.000000 | 0.000000 | 11.768475 |
| 50% | 3.000000 | 0.000000 | 30.295000 | 0.000000 | 0.000000 | 27.859150 |
| 75% | 3.000000 | 1.000000 | 34.307500 | 1.000000 | 0.000000 | 58.361350 |
| max | 3.000000 | 1.000000 | 73.340000 | 7.000000 | 3.000000 | 381.859600 |
from sdv.evaluation import evaluate
# Compare the synthetic sample (first argument) with the real table (second
# argument); the number echoed below is the value evaluate() returned.
evaluate(new_data, data)
0.539229052965023
# Second model: override the per-column transformers and request a truncated
# Gaussian distribution for Fare.
model = CopulaGAN(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        # NOTE(review): in the real table SibSp ranges from 0 to 8, yet with
        # 'boolean' the recorded synthetic sample only contains 0/1 (max 1.0).
        # 'integer' is probably what was intended -- confirm, then re-run and
        # refresh the recorded outputs.
        'SibSp': 'boolean',
        'Parch': 'integer',
        'Fare': 'float',
    },
    field_distributions={
        'Fare': 'truncated_gaussian',
    },
)
# A newly constructed model has learned nothing yet; it must be fitted on the
# real table before sampling (this step was missing).
model.fit(data)

# Draw 200 synthetic rows from the customised model and preview the first few.
new_data = model.sample(200)
new_data.head()
|   | Pclass | Sex | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 1.07 | 1 | 0 | 13.8318 |
| 1 | 3 | 0 | 15.58 | 0 | 0 | 46.6937 |
| 2 | 1 | 1 | 26.53 | 0 | 0 | 53.8841 |
| 3 | 1 | 0 | 29.58 | 0 | 0 | 5.2646 |
| 4 | 1 | 0 | 30.13 | 0 | 0 | 5.1415 |
# Summary statistics of the sample drawn from the customised model.
new_data.describe(include='all')
|   | Pclass | Sex | Age | SibSp | Parch | Fare |
|---|---|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 2.210000 | 0.410000 | 22.935600 | 0.550000 | 0.145000 | 49.155439 |
| std | 0.932873 | 0.493068 | 13.955182 | 0.498742 | 0.441531 | 42.733997 |
| min | 1.000000 | 0.000000 | 0.430000 | 0.000000 | 0.000000 | 4.749400 |
| 25% | 1.000000 | 0.000000 | 13.427500 | 0.000000 | 0.000000 | 13.340950 |
| 50% | 3.000000 | 0.000000 | 25.875000 | 1.000000 | 0.000000 | 34.373700 |
| 75% | 3.000000 | 1.000000 | 30.225000 | 1.000000 | 0.000000 | 75.206275 |
| max | 3.000000 | 1.000000 | 71.340000 | 1.000000 | 3.000000 | 220.761200 |
# Score the latest synthetic sample against the real table; the call that
# produced the recorded value below was missing.
evaluate(new_data, data)
0.4983713994365315