Synthetic data with SDV and Gaussian copulas

import pandas as pd

# Load the training feature table and preview its first five rows.
TRAIN_FEATURES_CSV = "data/svm-hyperparameters-train-features.csv"
data = pd.read_csv(TRAIN_FEATURES_CSV)
data.head(5)

   Pclass  Sex     Age  SibSp  Parch    Fare
0       3    1  22.000      1      0   7.250
1       1    0  38.000      1      0  71.283
2       3    0  26.000      0      0   7.925
3       1    0  35.000      1      0  53.100
4       3    1  35.000      0      0   8.050
# Summary statistics for every column of the training table.
data.describe(include="all")

        Pclass      Sex      Age    SibSp    Parch     Fare
count  891.000  891.000  891.000  891.000  891.000  891.000
mean     2.309    0.648   29.759    0.523    0.382   32.204
std      0.836    0.478   13.003    1.103    0.806   49.693
min      1.000    0.000    0.420    0.000    0.000    0.000
25%      2.000    0.000   22.000    0.000    0.000    7.910
50%      3.000    1.000   30.000    0.000    0.000   14.454
75%      3.000    1.000   35.000    1.000    0.000   31.000
max      3.000    1.000   80.000    8.000    6.000  512.329
from sdv.tabular import GaussianCopula

# Baseline: fit a Gaussian copula with no field-level configuration,
# then draw a synthetic table and preview its first rows.
N_SAMPLES = 1000  # number of synthetic rows to draw

model = GaussianCopula()
model.fit(data)

new_df = model.sample(N_SAMPLES)
new_df.head(5)

   Pclass  Sex     Age  SibSp  Parch    Fare
0       3    1  12.780      1      1   2.070
1       2    1  44.930      0      0  95.806
2       2    1  21.980      1      0  72.020
3       2    0  30.120      2      1  84.968
4       2    1  23.480      1      1  37.170
# Summary statistics of the synthetic sample, for comparison with the
# statistics of the training table.
new_df.describe()

         Pclass       Sex       Age     SibSp     Parch      Fare
count  1000.000  1000.000  1000.000  1000.000  1000.000  1000.000
mean      2.127     0.595    30.168     0.972     0.671    46.234
std       0.697     0.491    13.224     0.786     0.666    35.294
min       1.000     0.000     0.620     0.000     0.000     0.180
25%       2.000     0.000    20.693     0.000     0.000    16.959
50%       2.000     1.000    29.795     1.000     1.000    38.102
75%       3.000     1.000    39.163     1.000     1.000    67.841
max       3.000     1.000    77.000     4.000     3.000   166.853

<ggplot: (338236528)>

<ggplot: (338395612)>

<ggplot: (338577035)>

<ggplot: (338383996)>
# Second attempt: state explicitly how each column should be transformed
# instead of relying on the model's defaults.
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        # SibSp is a count (0-8 in the training data), not a yes/no flag;
        # a boolean transformer would collapse it to two values.
        'SibSp': 'integer',
        'Parch': 'integer',
        'Fare': 'float',
    }
)
model.fit(data)

# Draw a fresh synthetic sample from the reconfigured model and preview it.
new_df = model.sample(N_SAMPLES)
new_df.head()

   Pclass  Sex     Age  SibSp  Parch    Fare
0       3    0  24.330      0      1  24.759
1       1    1  58.070      0      0  22.957
2       2    0  23.700      0      0  15.298
3       1    1  30.200      0      0  19.092
4       1    1  51.370      0      0  99.106

<ggplot: (338395206)>

<ggplot: (338261950)>

<ggplot: (338558748)>

<ggplot: (338314674)>
# Summary statistics of the real Fare column.
data["Fare"].describe()
count   891.000
mean     32.204
std      49.693
min       0.000
25%       7.910
50%      14.454
75%      31.000
max     512.329
Name: Fare, dtype: float64
# Ask the fitted model which univariate distribution it is using for each
# column, then display the mapping.
distributions = model.get_distributions()
distributions
{'Pclass.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'Sex.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'Age.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'SibSp.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'Parch.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian',
 'Fare.value': 'copulas.univariate.truncated_gaussian.TruncatedGaussian'}
# Third attempt: keep the explicit per-column transformers and additionally
# pin the marginal distribution used for Fare.
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        # SibSp is a count (0-8 in the training data), not a yes/no flag;
        # a boolean transformer would collapse it to two values.
        'SibSp': 'integer',
        'Parch': 'integer',
        'Fare': 'float',
    },
    field_distributions={
        # NOTE(review): the previously fitted model already reported
        # TruncatedGaussian for Fare, so this override may be a no-op.
        # Fare is strongly right-skewed; a skewed family such as 'gamma'
        # might fit better - confirm before changing.
        'Fare': 'truncated_gaussian',
    }
)
model.fit(data)

# Resample and inspect only the Fare column of the synthetic data.
new_df = model.sample(N_SAMPLES)
new_df.Fare.describe()
count   1000.000
mean      47.928
std       36.876
min        0.044
25%       18.738
50%       40.028
75%       69.421
max      195.331
Name: Fare, dtype: float64

<ggplot: (338496543)>