Contents

Synthetic data with SDV and Gaussian copulas

import pandas as pd

# Load the training feature table from CSV and preview its first rows.
data = pd.read_csv("data/svm-hyperparameters-train-features.csv")
data.head()
   Pclass  Sex   Age  SibSp  Parch     Fare
0       3    1  22.0      1      0   7.2500
1       1    0  38.0      1      0  71.2833
2       3    0  26.0      0      0   7.9250
3       1    0  35.0      1      0  53.1000
4       3    1  35.0      0      0   8.0500
# Summary statistics of the original data; the baseline the synthetic samples are compared against.
data.describe(include="all")
           Pclass         Sex         Age       SibSp       Parch        Fare
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean     2.308642    0.647587   29.758889    0.523008    0.381594   32.204208
std      0.836071    0.477990   13.002570    1.102743    0.806057   49.693429
min      1.000000    0.000000    0.420000    0.000000    0.000000    0.000000
25%      2.000000    0.000000   22.000000    0.000000    0.000000    7.910400
50%      3.000000    1.000000   30.000000    0.000000    0.000000   14.454200
75%      3.000000    1.000000   35.000000    1.000000    0.000000   31.000000
max      3.000000    1.000000   80.000000    8.000000    6.000000  512.329200
from sdv.tabular import GaussianCopula
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
/var/folders/c2/9d2fsqt57t10zn1f2ylp1jxw0000gn/T/ipykernel_62349/3994888440.py in <module>
----> 1 from sdv.tabular import GaussianCopula

ModuleNotFoundError: No module named 'sdv'

```python

# First attempt: a Gaussian copula model with no explicit transformers or distributions.
model = GaussianCopula()

```

```python

# Fit the default model on the original feature table.
model.fit(data)

```

```python

# Number of synthetic rows drawn by every model.sample(...) call in this notebook.
N_SAMPLES = 1000

```

```python

# Draw N_SAMPLES synthetic rows from the fitted default model.
new_df = model.sample(N_SAMPLES)

```

```python

# Preview the first synthetic rows.
new_df.head()

```

```python

# Summary statistics of the synthetic sample, to compare with data.describe(...) above.
new_df.describe()

```

```python

from plotnine import *
from plotnine.data import *
from plotutils import *

# Overlay the Pclass histograms: original data in colours[0], synthetic
# sample in colours[1]. Both layers are semi-transparent so overlap is visible.
(
    ggplot()
    + geom_histogram(
        data=data,
        mapping=aes(x='Pclass'),
        fill=colours[0],
        bins=3,
        alpha=0.3,
    )
    + geom_histogram(
        data=new_df,
        mapping=aes(x='Pclass'),
        fill=colours[1],
        bins=3,
        alpha=0.3,
    )
    + theme_classic()
)

```

```python

# Overlay the Sex histograms: original data in colours[0], synthetic
# sample in colours[1].
(
    ggplot()
    + geom_histogram(
        data=data,
        mapping=aes(x='Sex'),
        fill=colours[0],
        bins=6,
        alpha=0.3,
    )
    + geom_histogram(
        data=new_df,
        mapping=aes(x='Sex'),
        fill=colours[1],
        bins=6,
        alpha=0.3,
    )
    + theme_classic()
)

```

```python

# Compare the Age densities of the original (colours[0]) and synthetic
# (colours[1]) data; the dotted vertical lines mark each dataset's mean.
(
    ggplot(mapping=aes(x='Age'))
    + geom_density(data=data, fill=colours[0], alpha=0.2)
    + geom_density(data=new_df, fill=colours[1], alpha=0.2)
    + geom_vline(
        xintercept=data.Age.mean(),
        linetype='dotted',
        colour=colours[0],
    )
    + geom_vline(
        xintercept=new_df.Age.mean(),
        linetype='dotted',
        colour=colours[1],
    )
    + theme_classic()
)

```

```python

# Compare the Fare densities of the original (colours[0]) and synthetic
# (colours[1]) data; the dotted vertical lines mark each dataset's mean.
(
    ggplot(mapping=aes(x='Fare'))
    + geom_density(data=data, fill=colours[0], alpha=0.2)
    + geom_density(data=new_df, fill=colours[1], alpha=0.2)
    + geom_vline(
        xintercept=data.Fare.mean(),
        linetype='dotted',
        colour=colours[0],
    )
    + geom_vline(
        xintercept=new_df.Fare.mean(),
        linetype='dotted',
        colour=colours[1],
    )
    + theme_classic()
)

```

```python

# Second attempt: tell the model how to treat each column instead of relying
# on its defaults. Pclass and Sex are discrete codes, so they are categorical.
# SibSp is a count (0 to 8 in the describe() output above), so it is declared
# as an integer; a 'boolean' transformer cannot represent counts above 1.
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        'SibSp': 'integer',
        'Parch': 'integer',
        'Fare': 'float',
    }
)

```

```python

# Refit, this time using the model configured with explicit field transformers.
model.fit(data)

```

```python

# Replace the earlier synthetic sample with one drawn from the refitted model.
new_df = model.sample(N_SAMPLES)

```

```python

# Preview the first rows of the new synthetic sample.
new_df.head()

```

```python

# Pclass histograms again, now against the sample from the model with
# explicit field transformers (original: colours[0], synthetic: colours[1]).
(
    ggplot()
    + geom_histogram(
        data=data,
        mapping=aes(x='Pclass'),
        fill=colours[0],
        bins=3,
        alpha=0.3,
    )
    + geom_histogram(
        data=new_df,
        mapping=aes(x='Pclass'),
        fill=colours[1],
        bins=3,
        alpha=0.3,
    )
    + theme_classic()
)

```

```python

# Sex histograms again, now against the sample from the model with
# explicit field transformers (original: colours[0], synthetic: colours[1]).
(
    ggplot()
    + geom_histogram(
        data=data,
        mapping=aes(x='Sex'),
        fill=colours[0],
        bins=6,
        alpha=0.3,
    )
    + geom_histogram(
        data=new_df,
        mapping=aes(x='Sex'),
        fill=colours[1],
        bins=6,
        alpha=0.3,
    )
    + theme_classic()
)

```

```python

# Age densities for the original (colours[0]) and the newly sampled synthetic
# (colours[1]) data, with dotted lines at each dataset's mean.
(
    ggplot(mapping=aes(x='Age'))
    + geom_density(data=data, fill=colours[0], alpha=0.2)
    + geom_density(data=new_df, fill=colours[1], alpha=0.2)
    + geom_vline(
        xintercept=data.Age.mean(),
        linetype='dotted',
        colour=colours[0],
    )
    + geom_vline(
        xintercept=new_df.Age.mean(),
        linetype='dotted',
        colour=colours[1],
    )
    + theme_classic()
)

```

```python

# Fare densities for the original (colours[0]) and the newly sampled synthetic
# (colours[1]) data, with dotted lines at each dataset's mean.
(
    ggplot(mapping=aes(x='Fare'))
    + geom_density(data=data, fill=colours[0], alpha=0.2)
    + geom_density(data=new_df, fill=colours[1], alpha=0.2)
    + geom_vline(
        xintercept=data.Fare.mean(),
        linetype='dotted',
        colour=colours[0],
    )
    + geom_vline(
        xintercept=new_df.Fare.mean(),
        linetype='dotted',
        colour=colours[1],
    )
    + theme_classic()
)

```

```python

# Summary statistics of the original Fare column only.
data.Fare.describe()

```

```python

# Ask the fitted model for its distributions (presumably one marginal per
# column; confirm against the SDV documentation).
distributions = model.get_distributions()

```

```python

# Display what get_distributions() returned.
distributions

```

```python

# Third attempt: keep the explicit per-column transformers and additionally
# request a truncated Gaussian marginal for Fare (presumably to keep sampled
# fares within the observed, non-negative range).
# SibSp is a count (0 to 8 in the describe() output above), so it is declared
# as an integer; a 'boolean' transformer cannot represent counts above 1.
model = GaussianCopula(
    field_transformers={
        'Pclass': 'categorical',
        'Sex': 'categorical',
        'Age': 'float',
        'SibSp': 'integer',
        'Parch': 'integer',
        'Fare': 'float',
    },
    field_distributions={
        'Fare': 'truncated_gaussian',
    },
)

```

```python

# Refit using the model that requests a truncated Gaussian for Fare.
model.fit(data)

```

```python

# Draw a fresh synthetic sample from the refitted model.
new_df = model.sample(N_SAMPLES)

```

```python

# Summary statistics of the synthetic Fare column, to compare with data.Fare.describe().
new_df.Fare.describe()

```

```python

# Fare densities once more, now against the sample from the model with the
# truncated Gaussian Fare distribution (original: colours[0], synthetic:
# colours[1]); dotted lines mark each dataset's mean.
(
    ggplot(mapping=aes(x='Fare'))
    + geom_density(data=data, fill=colours[0], alpha=0.2)
    + geom_density(data=new_df, fill=colours[1], alpha=0.2)
    + geom_vline(
        xintercept=data.Fare.mean(),
        linetype='dotted',
        colour=colours[0],
    )
    + geom_vline(
        xintercept=new_df.Fare.mean(),
        linetype='dotted',
        colour=colours[1],
    )
    + theme_classic()
)

```

```python

```