Synthetic data with SDV and CopulaGAN

import pandas as pd
import warnings

# Silence all Python warnings for the rest of the session so library noise
# does not clutter the cell output.
warnings.filterwarnings(action='ignore')

# Read the feature table that the synthetic-data models are fitted on.
data = pd.read_csv(
    filepath_or_buffer="data/svm-hyperparameters-train-features.csv",
)

# Preview the first rows.
data.head()
Pclass Sex Age SibSp Parch Fare
0 3 1 22.0 1 0 7.2500
1 1 0 38.0 1 0 71.2833
2 3 0 26.0 0 0 7.9250
3 1 0 35.0 1 0 53.1000
4 3 1 35.0 0 0 8.0500
# Summary statistics of the real data; these are the reference values the
# synthetic samples are compared against further down.
data.describe(include='all')
Pclass Sex Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 2.308642 0.647587 29.758889 0.523008 0.381594 32.204208
std 0.836071 0.477990 13.002570 1.102743 0.806057 49.693429
min 1.000000 0.000000 0.420000 0.000000 0.000000 0.000000
25% 2.000000 0.000000 22.000000 0.000000 0.000000 7.910400
50% 3.000000 1.000000 30.000000 0.000000 0.000000 14.454200
75% 3.000000 1.000000 35.000000 1.000000 0.000000 31.000000
max 3.000000 1.000000 80.000000 8.000000 6.000000 512.329200
from sdv.tabular import CopulaGAN
# Baseline: a CopulaGAN built with no arguments, i.e. library defaults only.
model = CopulaGAN()
model.fit(data)
# Draw 200 synthetic rows from the fitted model and preview them.
new_data = model.sample(200)
new_data.head()
Pclass Sex Age SibSp Parch Fare
0 3 0 32.02 0 0 18.2784
1 3 1 20.90 5 0 109.6910
2 3 0 40.80 0 1 179.5139
3 3 0 33.13 1 0 17.3447
4 1 1 20.62 0 0 9.6040
# Summary statistics of the baseline synthetic sample, for comparison with
# the real-data statistics.
new_data.describe(include='all')
Pclass Sex Age SibSp Parch Fare
count 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000
mean 2.490000 0.450000 31.404700 0.405000 0.115000 46.756109
std 0.820528 0.498742 12.297721 0.919348 0.461345 52.828023
min 1.000000 0.000000 0.450000 0.000000 0.000000 3.034900
25% 2.000000 0.000000 24.712500 0.000000 0.000000 11.768475
50% 3.000000 0.000000 30.295000 0.000000 0.000000 27.859150
75% 3.000000 1.000000 34.307500 1.000000 0.000000 58.361350
max 3.000000 1.000000 73.340000 7.000000 3.000000 381.859600
from sdv.evaluation import evaluate

# Score the synthetic sample against the real data (synthetic first, real
# second). NOTE(review): per the SDV docs the result is an aggregate
# similarity score where higher is better -- confirm for the installed version.
evaluate(new_data, data)
0.539229052965023
# Second model: state explicitly how each column is transformed and which
# marginal distribution is used for Fare, instead of relying on defaults.
#
# NOTE: any recorded output shown below this cell was produced with SibSp
# configured as 'boolean'; re-run the cell to refresh it.
model = CopulaGAN(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        # SibSp is a count (0-8 in the real data), not a yes/no flag.
        # Treating it as 'boolean' collapsed every synthetic value to 0/1,
        # so it is modelled as an integer, the same way as Parch.
        'SibSp': 'integer',
        'Parch': 'integer',
        'Fare': 'float',
    },
    field_distributions={
        # Fare has a hard lower bound (0 in the real data), so a truncated
        # Gaussian is used for its marginal.
        'Fare': 'truncated_gaussian',
    },
)
model.fit(data)

# Draw 200 synthetic rows from the configured model and preview them.
new_data = model.sample(200)
new_data.head()
Pclass Sex Age SibSp Parch Fare
0 1 0 1.07 1 0 13.8318
1 3 0 15.58 0 0 46.6937
2 1 1 26.53 0 0 53.8841
3 1 0 29.58 0 0 5.2646
4 1 0 30.13 0 0 5.1415
# Summary statistics of the sample drawn from the explicitly configured
# model, for comparison with the real-data statistics.
new_data.describe(include='all')
Pclass Sex Age SibSp Parch Fare
count 200.000000 200.000000 200.000000 200.000000 200.000000 200.000000
mean 2.210000 0.410000 22.935600 0.550000 0.145000 49.155439
std 0.932873 0.493068 13.955182 0.498742 0.441531 42.733997
min 1.000000 0.000000 0.430000 0.000000 0.000000 4.749400
25% 1.000000 0.000000 13.427500 0.000000 0.000000 13.340950
50% 3.000000 0.000000 25.875000 1.000000 0.000000 34.373700
75% 3.000000 1.000000 30.225000 1.000000 0.000000 75.206275
max 3.000000 1.000000 71.340000 1.000000 3.000000 220.761200
# Re-score the latest synthetic sample against the real data so the result
# can be compared with the baseline model's score.
evaluate(new_data, data)
0.4983713994365315