Synthetic data with SDV and Gaussian copulas

import pandas as pd

# Load the training feature table from CSV into a DataFrame.
data = pd.read_csv("data/svm-hyperparameters-train-features.csv")
# Preview the first rows to check the columns and their values.
data.head()
Pclass Sex Age SibSp Parch Fare
0 3 1 22.000 1 0 7.250
1 1 0 38.000 1 0 71.283
2 3 0 26.000 0 0 7.925
3 1 0 35.000 1 0 53.100
4 3 1 35.000 0 0 8.050
# Summary statistics for every column of the original data; these are the
# reference values the synthetic samples are compared against.
data.describe(include="all")
Pclass Sex Age SibSp Parch Fare
count 891.000 891.000 891.000 891.000 891.000 891.000
mean 2.309 0.648 29.759 0.523 0.382 32.204
std 0.836 0.478 13.003 1.103 0.806 49.693
min 1.000 0.000 0.420 0.000 0.000 0.000
25% 2.000 0.000 22.000 0.000 0.000 7.910
50% 3.000 1.000 30.000 0.000 0.000 14.454
75% 3.000 1.000 35.000 1.000 0.000 31.000
max 3.000 1.000 80.000 8.000 6.000 512.329
from sdv.tabular import GaussianCopula

# Number of synthetic rows drawn from each fitted model.
N_SAMPLES = 1000

# Baseline: fit a Gaussian copula with all-default settings.
model = GaussianCopula()
model.fit(data)

# Draw the synthetic sample and preview its first rows.
new_df = model.sample(N_SAMPLES)
new_df.head()
Pclass Sex Age SibSp Parch Fare
0 3 1 12.780 1 1 2.070
1 2 1 44.930 0 0 95.806
2 2 1 21.980 1 0 72.020
3 2 0 30.120 2 1 84.968
4 2 1 23.480 1 1 37.170
# Summary statistics of the synthetic sample, to compare with the
# statistics of the original data.
new_df.describe()
Pclass Sex Age SibSp Parch Fare
count 1000.000 1000.000 1000.000 1000.000 1000.000 1000.000
mean 2.127 0.595 30.168 0.972 0.671 46.234
std 0.697 0.491 13.224 0.786 0.666 35.294
min 1.000 0.000 0.620 0.000 0.000 0.180
25% 2.000 0.000 20.693 0.000 0.000 16.959
50% 2.000 1.000 29.795 1.000 1.000 38.102
75% 3.000 1.000 39.163 1.000 1.000 67.841
max 3.000 1.000 77.000 4.000 3.000 166.853

<ggplot: (338236528)>

<ggplot: (338395612)>

<ggplot: (338577035)>

<ggplot: (338383996)>
# Refit the copula with an explicit transformer for every column instead of
# using the model defaults.
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        # SibSp is a count (0-8 in the training data), not a two-valued
        # flag, so it is modelled as an integer like Parch.
        'SibSp': 'integer',
        'Parch': 'integer',
        'Fare': 'float',
    }
)
model.fit(data)

# Draw a fresh synthetic sample and preview its first rows.
new_df = model.sample(N_SAMPLES)
new_df.head()
Pclass Sex Age SibSp Parch Fare
0 3 0 24.330 0 1 24.759
1 1 1 58.070 0 0 22.957
2 2 0 23.700 0 0 15.298
3 1 1 30.200 0 0 19.092
4 1 1 51.370 0 0 99.106

<ggplot: (338395206)>

<ggplot: (338261950)>

<ggplot: (338558748)>

<ggplot: (338314674)>
# Summary statistics of the original Fare column on its own.
data.Fare.describe()
count   891.000
mean     32.204
std      49.693
min       0.000
25%       7.910
50%      14.454
75%      31.000
max     512.329
Name: Fare, dtype: float64
# Ask the fitted model which univariate distribution it used for each column,
# then display the resulting mapping.
distributions = model.get_distributions()
distributions
{'Pclass.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'Sex.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'Age.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'SibSp.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'Parch.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'Fare.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian'}
# Refit with explicit transformers for every column and an explicitly chosen
# marginal distribution for Fare.
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        # SibSp is a count (0-8 in the training data), not a two-valued
        # flag, so it is modelled as an integer like Parch.
        'SibSp': 'integer',
        'Parch': 'integer',
        'Fare': 'float',
    },
    field_distributions={
        # Pin the marginal used for Fare rather than leaving it to the model.
        'Fare': 'truncated_gaussian',
    },
)
model.fit(data)

# Draw a fresh synthetic sample and summarise the synthetic Fare column.
new_df = model.sample(N_SAMPLES)
new_df.Fare.describe()
count   1000.000
mean      47.928
std       36.876
min        0.044
25%       18.738
50%       40.028
75%       69.421
max      195.331
Name: Fare, dtype: float64

<ggplot: (338496543)>