Generating synthetic data

Synthetic data is useful whenever real data is scarce, sensitive, or simply not yet available, for instance when prototyping a model or testing a pipeline.

Here we will look mainly at the methods provided by scikit-learn to generate synthetic datasets. For more advanced methods, such as those provided by the SDV library, please check the SDV page. It supports methods such as Gaussian copulas, CTGAN and CopulaGAN.
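For reference, here is a minimal sketch of what SDV usage looks like. This assumes the SDV 1.x single-table API; `real_df` is a stand-in for whatever real table you want to mimic.

import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

# A minimal sketch, assuming the SDV 1.x single-table API; real_df is a toy stand-in
real_df = pd.DataFrame({"age": [23, 35, 41, 29], "income": [30_000, 52_000, 61_000, 45_000]})

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_df)  # infer column types from the real table

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_df)                 # learn the joint distribution
synthetic_df = synthesizer.sample(num_rows=100)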

Regression data

A regression task consists of predicting a continuous target variable from a set of input features.

For this section we will mainly use scikit-learn's make_regression method.

For reproducibility, we will set a random_state.

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

random_state = 23

We will create a dataset using make_regression's random linear regression model with input features \(x=(f_1,f_2,f_3,f_4)\) and an output \(y\).
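Internally, make_regression samples the features from a standard normal distribution and computes the target as \(y = Xw + b + \epsilon\), where only n_informative entries of \(w\) are non-zero, \(b\) is the bias and \(\epsilon \sim \mathcal{N}(0, \text{noise}^2)\).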

import matplotlib.pyplot as plt
from plotnine import *
from plotnine.data import *
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from scipy.stats import linregress

N_FEATURES = 4
N_TARGETS = 1
N_SAMPLES = 100

dataset = make_regression(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=2,
    n_targets=N_TARGETS,
    bias=0.0,
    effective_rank=None,
    tail_strength=0.5,
    noise=0.0,
    shuffle=True,
    coef=False,
    random_state=random_state,
)

print(dataset[0][:10])
print(dataset[1][:10])
[[ 0.87305874 -1.63096187  0.52538404 -0.19035824]
 [ 1.00698671  0.79834941 -0.04057655 -0.31358605]
 [-0.61464273  1.65110321  0.75791487 -0.0039844 ]
 [-1.08536678  1.82337823  0.4612592  -1.72325306]
 [-1.67774847 -0.54401341  0.86347869 -0.30250463]
 [-0.02427254  0.75537599 -0.04644972 -0.85153564]
 [-0.48085576  0.82100952 -0.9390196  -0.25870492]
 [-0.66772841 -2.46244005 -0.19855095 -1.85756579]
 [-0.29810663 -0.02239635  0.25363492 -1.22688366]
 [ 1.48146924  0.38269965 -1.18208819 -1.31062148]]
[  20.00449025  -30.41054677   52.65371365 -119.26376184   33.78805456
  -78.12189078  -88.41673748 -177.21674804  -90.13920313 -197.90799195]
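Since we set n_informative=2, only two of the four features actually drive \(y\). Passing coef=True makes make_regression also return the ground-truth coefficients, which lets us verify this:

# Passing coef=True additionally returns the ground-truth coefficients
X, y, coef = make_regression(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=2,
    n_targets=N_TARGETS,
    coef=True,
    random_state=random_state,
)
print(coef)  # only two entries are non-zero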

Let's turn this dataset into a Pandas DataFrame:

df = pd.DataFrame(data=dataset[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])

df["y"] = dataset[1]
df.head()
f1 f2 f3 f4 y
0 0.873059 -1.630962 0.525384 -0.190358 20.004490
1 1.006987 0.798349 -0.040577 -0.313586 -30.410547
2 -0.614643 1.651103 0.757915 -0.003984 52.653714
3 -1.085367 1.823378 0.461259 -1.723253 -119.263762
4 -1.677748 -0.544013 0.863479 -0.302505 33.788055

Let's plot the data:

from plotutils import *  # local helper module providing the colours and edges palettes


def plot_regression(df, size):
    # Scatter each feature against y, with a first-degree polynomial fit per panel
    for i in range(size):
        fit = np.polyfit(df[df.columns[i]], df["y"], 1)
        fit_fn = np.poly1d(fit)
        plt.subplot(2, 2, i + 1)
        plt.xlabel(f"f{i+1}")
        plt.ylabel("y")
        plt.scatter(df[df.columns[i]], df["y"], s=30, c=colours[1], edgecolor=edges[1])
        plt.plot(
            df[df.columns[i]], fit_fn(df[df.columns[i]]), ls="--", c=colours[0], lw=1
        )


plot_regression(df, N_FEATURES)

[Figure: each feature f1–f4 plotted against y, with a linear fit per panel]

Changing the Gaussian noise level

The noise parameter in make_regression adjusts the standard deviation of the zero-centred Gaussian noise applied to the output.

dataset = make_regression(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=2,
    n_targets=N_TARGETS,
    bias=0.0,
    effective_rank=None,
    tail_strength=0.5,
    noise=2.0,
    shuffle=True,
    coef=False,
    random_state=random_state,
)

df = pd.DataFrame(data=dataset[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])

df["y"] = dataset[1]
plot_regression(df, N_FEATURES)

[Figure: the same feature-vs-y scatter plots with noise=2.0]

Visualising increasing noise

Let's increase the noise to \(10^i\), for \(i=0, 1, 2\), and see what the data looks like.

df = pd.DataFrame(data=np.zeros((N_SAMPLES, 1)))


def create_noisy_data(noise):
    return make_regression(
        n_samples=N_SAMPLES,
        n_features=1,
        n_informative=1,
        n_targets=1,
        bias=0.0,
        effective_rank=None,
        tail_strength=0.5,
        noise=noise,
        shuffle=True,
        coef=False,
        random_state=random_state,
    )


for i in range(3):
    data = create_noisy_data(10 ** i)

    df[f"f{i+1}"] = data[0]
    df[f"y{i+1}"] = data[1]
for i in range(3):
    fit = np.polyfit(df[f"f{i+1}"], df[f"y{i+1}"], 1)
    fit_fn = np.poly1d(fit)
    plt.subplot(1, 3, i + 1)
    plt.scatter(df[f"f{i+1}"], df[f"y{i+1}"], s=30, c=colours[1], edgecolor=edges[1])
    plt.plot(
        df[f"f{i+1}"],
        fit_fn(df[f"f{i+1}"]),
        ls="--",
        color=colours[0],
        lw=1,
    )
    plt.xlabel(f"f{i+1}")
    plt.ylabel(f"y{i+1}")

[Figure: feature-vs-target scatter plots for noise = 1, 10 and 100]
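We can quantify the degradation using the linregress function we imported earlier: the squared correlation coefficient of the univariate fit drops as the noise grows.

# r^2 of the univariate fit shrinks as the noise scale increases
for i in range(3):
    result = linregress(df[f"f{i+1}"], df[f"y{i+1}"])
    print(f"noise=10^{i}: r^2 = {result.rvalue ** 2:.3f}")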

(data:classification)=

Classification data

To generate data for classification we will use the make_classification method.

from sklearn.datasets import make_classification

N = 4

data = make_classification(
    n_samples=N_SAMPLES,
    n_features=N,
    n_informative=4,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=random_state,
)

df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N)])

df["y"] = data[1]
df.head()
f1 f2 f3 f4 y
0 -3.216 -0.416 -1.295 -1.882 0
1 -1.426 -1.257 -1.734 -1.804 0
2 2.798 -3.010 -1.085 -3.134 1
3 0.633 2.502 -1.553 1.625 1
4 1.494 0.912 -1.887 -1.457 1
from itertools import combinations
from math import ceil

lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)

for i in range(1, len_var + 1):
    plt.subplot(2, ceil(len_var / 2), i)
    var1 = lst_var[i - 1][0]
    var2 = lst_var[i - 1][1]
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda y: colours[y]),
        edgecolor=df["y"].apply(lambda y: edges[y]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)

[Figure: pairwise feature scatter plots, points coloured by class]
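make_classification can also produce imbalanced classes through the weights parameter; a quick sketch (the 90/10 split below is just an illustrative choice):

# Illustrative: roughly 90% of samples in class 0 and 10% in class 1
imbalanced = make_classification(
    n_samples=N_SAMPLES,
    n_features=N,
    n_informative=4,
    n_redundant=0,
    n_classes=2,
    weights=[0.9, 0.1],
    random_state=random_state,
)
print(pd.Series(imbalanced[1]).value_counts())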

Cluster separation

According to the docs[^1], class_sep is the factor multiplying the hypercube size.

Larger values spread out the clusters/classes and make the classification task easier.

N_FEATURES = 4

data = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=4,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,
    flip_y=0.01,
    class_sep=3.0,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=random_state,
)

df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])

df["y"] = data[1]
from itertools import combinations
from math import ceil

lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)

for i in range(1, len_var + 1):
    plt.subplot(2, ceil(len_var / 2), i)
    var1 = lst_var[i - 1][0]
    var2 = lst_var[i - 1][1]
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda y: colours[y]),
        edgecolor=df["y"].apply(lambda y: edges[y]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)

[Figure: pairwise feature scatter plots with class_sep=3.0; the classes are well separated]

We can make the classes harder to separate by decreasing the value of class_sep.

N_FEATURES = 4

data = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=4,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,
    flip_y=0.01,
    class_sep=0.5,
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=random_state,
)

df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])

df["y"] = data[1]
from itertools import combinations
from math import ceil

lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)

for i in range(1, len_var + 1):
    plt.subplot(2, ceil(len_var / 2), i)
    var1 = lst_var[i - 1][0]
    var2 = lst_var[i - 1][1]
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda y: colours[y]),
        edgecolor=df["y"].apply(lambda y: edges[y]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)

[Figure: pairwise feature scatter plots with class_sep=0.5; the classes overlap]

Noise level

According to the documentation[^1], flip_y is the fraction of samples whose class is assigned randomly.

Larger values introduce noise in the labels and make the classification task harder.

N_FEATURES = 4


for i in range(6):
    data = make_classification(
        n_samples=N_SAMPLES,
        n_features=N_FEATURES,
        n_informative=4,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        weights=None,
        flip_y=0.1 * i,
        class_sep=1.0,
        hypercube=True,
        shift=0.0,
        scale=1.0,
        shuffle=False,
        random_state=random_state,
    )
    df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])
    df["y"] = data[1]
    plt.subplot(2, 3, i + 1)
    plt.title(f"flip_y={round(0.1*i,2)}")
    plt.scatter(
        df["f1"],
        df["f2"],
        s=50,
        c=df["y"].apply(lambda y: colours[y]),
        edgecolor=df["y"].apply(lambda y: edges[y]),
    )
plt.tight_layout(pad=3.0)

[Figure: f1 vs f2 scatter plots for flip_y from 0.0 to 0.5]

We can run a similar sweep for class_sep, increasing it from 0.5 to 2.5:

df = pd.DataFrame(data=np.zeros((N_SAMPLES, 1)))

for i in range(3):
    data = make_classification(
        n_samples=N_SAMPLES,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        weights=None,
        flip_y=0,
        class_sep=i + 0.5,
        hypercube=True,
        shift=0.0,
        scale=1.0,
        shuffle=False,
        random_state=random_state,
    )
    df[f"f{i+1}1"] = data[0][:, 0]
    df[f"f{i+1}2"] = data[0][:, 1]
    df[f"t{i+1}"] = data[1]

for i in range(3):
    plt.subplot(1, 3, i + 1)
    plt.scatter(
        df[f"f{i+1}1"],
        df[f"f{i+1}2"],
        s=50,
        c=df[f"t{i+1}"].apply(lambda y: colours[y]),
        edgecolor=df[f"t{i+1}"].apply(lambda y: edges[y]),
    )

[Figure: scatter plots for class_sep = 0.5, 1.5 and 2.5]

It is noteworthy that many scikit-learn parameters for synthetic data generation accept per-feature or per-cluster inputs: we simply pass the parameter value as an array. For instance, we can shift and scale each feature differently by passing arrays to shift and scale (the values below are illustrative):

N = 4

data = make_classification(
    n_samples=N_SAMPLES,
    n_features=N,
    n_informative=4,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,
    flip_y=0.01,
    class_sep=1.0,
    hypercube=True,
    shift=[0.0, 1.0, 2.0, 3.0],  # per-feature shift (illustrative values)
    scale=[1.0, 0.5, 2.0, 1.0],  # per-feature scale (illustrative values)
    shuffle=True,
    random_state=random_state,
)

df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N)])

df["y"] = data[1]

Separability

To generate clustered data we will use the make_blobs method.

from sklearn.datasets import make_blobs

N_FEATURES = 4

data = make_blobs(
    n_samples=60,
    n_features=N_FEATURES,
    centers=3,
    cluster_std=1.0,
    center_box=(-5.0, 5.0),
    shuffle=True,
    random_state=random_state,
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])
df["y"] = data[1]
from itertools import combinations
from math import ceil

lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)
for i in range(1, len_var + 1):
    plt.subplot(2, ceil(len_var / 2), i)
    var1 = lst_var[i - 1][0]
    var2 = lst_var[i - 1][1]
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda y: colours[y]),
        edgecolor=df["y"].apply(lambda y: edges[y]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)

[Figure: pairwise scatter plots of the three blobs]
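Instead of an integer number of centers, make_blobs also accepts explicit centre coordinates; a short sketch with three hypothetical 2-D centres:

# Illustrative: place the three blob centres ourselves (2-D for easy plotting)
centres = [[-4, -4], [0, 0], [4, 4]]
data = make_blobs(
    n_samples=60,
    centers=centres,
    cluster_std=1.0,
    random_state=random_state,
)
print(data[0].shape)  # (60, 2): n_features is inferred from the centres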

To make the clusters more separable we can decrease cluster_std.

data = make_blobs(
    n_samples=60,
    n_features=N_FEATURES,
    centers=3,
    cluster_std=0.3,
    center_box=(-5.0, 5.0),
    shuffle=True,
    random_state=random_state,
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])
df["y"] = data[1]
from itertools import combinations
from math import ceil

lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)
for i in range(1, len_var + 1):
    plt.subplot(2, ceil(len_var / 2), i)
    var1 = lst_var[i - 1][0]
    var2 = lst_var[i - 1][1]
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda y: colours[y]),
        edgecolor=df["y"].apply(lambda y: edges[y]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)

[Figure: pairwise scatter plots with cluster_std=0.3; the blobs are tight and well separated]

By increasing cluster_std we make them less separable.

data = make_blobs(
    n_samples=60,
    n_features=N_FEATURES,
    centers=3,
    cluster_std=2.5,
    center_box=(-5.0, 5.0),
    shuffle=True,
    random_state=random_state,
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])
df["y"] = data[1]
from itertools import combinations
from math import ceil

lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)
for i in range(1, len_var + 1):
    plt.subplot(2, ceil(len_var / 2), i)
    var1 = lst_var[i - 1][0]
    var2 = lst_var[i - 1][1]
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda y: colours[y]),
        edgecolor=df["y"].apply(lambda y: edges[y]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)

[Figure: pairwise scatter plots with cluster_std=2.5; the blobs overlap]

Anisotropic data

Blobs generated by make_blobs are isotropic (spherical). Applying a linear transformation to the samples stretches them along a direction, producing anisotropic clusters.

data = make_blobs(n_samples=50, n_features=2, centers=3, cluster_std=1.5, random_state=random_state)
transformation = [[0.5, -0.5], [-0.4, 0.8]]
data_0 = np.dot(data[0], transformation)  # stretch the blobs along a direction
df = pd.DataFrame(data_0, columns=[f"f{i}" for i in range(1, 3)])
df["y"] = data[1]
plt.scatter(
    df["f1"],
    df["f2"],
    c=df["y"].apply(lambda y: colours[y]),
    s=50,
    edgecolors=df["y"].apply(lambda y: edges[y]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()

[Figure: anisotropic blobs after the linear transformation]

Concentric clusters

Sometimes we might be interested in creating clusters that are not linearly separable.

The simplest way is to create concentric clusters with the make_circles method.

from sklearn.datasets import make_circles

data = make_circles(
    n_samples=N_SAMPLES, shuffle=True, noise=None, random_state=random_state, factor=0.6
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(2)])
df["y"] = data[1]
plt.scatter(
    df["f1"],
    df["f2"],
    c=df["y"].apply(lambda y: colours[y]),
    s=50,
    edgecolors=df["y"].apply(lambda y: edges[y]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()

[Figure: two concentric circles, coloured by class]
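The factor parameter sets the ratio between the inner and the outer circle radius; a quick comparison of two values (0.3 and 0.8 are arbitrary choices):

# Smaller factor pulls the inner circle towards the centre
for j, factor in enumerate([0.3, 0.8]):
    circles = make_circles(n_samples=N_SAMPLES, factor=factor, random_state=random_state)
    plt.subplot(1, 2, j + 1)
    plt.title(f"factor={factor}")
    plt.scatter(circles[0][:, 0], circles[0][:, 1], s=30,
                c=[colours[c] for c in circles[1]])
plt.show()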

Adding noise

The noise parameter allows us to create a noisy concentric dataset.

data = make_circles(
    n_samples=N_SAMPLES, shuffle=True, noise=0.15, random_state=random_state, factor=0.6
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(2)])
df["y"] = data[1]
plt.scatter(
    df["f1"],
    df["f2"],
    c=df["y"].apply(lambda y: colours[y]),
    s=50,
    edgecolors=df["y"].apply(lambda y: edges[y]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()

[Figure: concentric circles with noise=0.15]

Moon clusters

A shape that can be useful for illustrating other methods (such as counterfactual explanations) is the one generated by the make_moons method.

from sklearn.datasets import make_moons

data = make_moons(
    n_samples=N_SAMPLES, shuffle=True, noise=None, random_state=random_state
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(2)])
df["y"] = data[1]
plt.scatter(
    df["f1"],
    df["f2"],
    c=df["y"].apply(lambda y: colours[y]),
    s=50,
    edgecolors=df["y"].apply(lambda y: edges[y]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()

[Figure: two interleaving half-moons]

Adding noise

As usual, the noise parameter allows us to control the amount of noise.

data = make_moons(
    n_samples=N_SAMPLES, shuffle=True, noise=0.1, random_state=random_state
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(2)])
df["y"] = data[1]
plt.scatter(
    df["f1"],
    df["f2"],
    c=df["y"].apply(lambda y: colours[y]),
    s=50,
    edgecolors=df["y"].apply(lambda y: edges[y]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()

[Figure: interleaving half-moons with noise=0.1]
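All of these generators drop straight into the usual scikit-learn workflow. As a closing sketch (the choice of LogisticRegression is purely illustrative):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Illustrative end-to-end use: fit a linear model on the moons data
X, y = make_moons(n_samples=N_SAMPLES, noise=0.1, random_state=random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

model = LogisticRegression().fit(X_train, y_train)
print(model.score(X_test, y_test))  # moons are not linearly separable, so expect < 1.0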



[^1]: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html