Generating synthetic data

Synthetic data will be used mainly for these scenarios:

  • Regression

  • Classification

Regression data

What does a regression consist of?

For this section we will mainly use scikit-learn’s make_regression method.

For reproducibility, we will set a random_state .

random_state = 23  # fixed seed so the generated datasets are reproducible across runs

We will create a dataset using make_regression ’s random linear regression model with input features \(x=(f_1,f_2,f_3,f_4)\) and an output \(y\) .

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression

# Dataset dimensions: 100 samples, 4 input features, a single target.
N_FEATURES = 4
N_TARGETS = 1
N_SAMPLES = 100

# Random linear-regression data; only 2 of the 4 features are informative,
# and no Gaussian noise is added to the output yet.
dataset = make_regression(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=2,  # number of features actually used to build y
    n_targets=N_TARGETS,
    bias=0.0,  # intercept of the underlying linear model
    effective_rank=None,  # None -> well-conditioned input features
    tail_strength=0.5,  # only relevant when effective_rank is not None
    noise=0.0,  # standard deviation of the Gaussian noise on the output
    shuffle=True,
    coef=False,  # don't return the ground-truth coefficients
    random_state=random_state,
)

# make_regression returns an (X, y) tuple: preview the first 10 rows of each.
print(dataset[0][:10])
print(dataset[1][:10])
[[ 0.87305874 -1.63096187  0.52538404 -0.19035824]
 [ 1.00698671  0.79834941 -0.04057655 -0.31358605]
 [-0.61464273  1.65110321  0.75791487 -0.0039844 ]
 [-1.08536678  1.82337823  0.4612592  -1.72325306]
 [-1.67774847 -0.54401341  0.86347869 -0.30250463]
 [-0.02427254  0.75537599 -0.04644972 -0.85153564]
 [-0.48085576  0.82100952 -0.9390196  -0.25870492]
 [-0.66772841 -2.46244005 -0.19855095 -1.85756579]
 [-0.29810663 -0.02239635  0.25363492 -1.22688366]
 [ 1.48146924  0.38269965 -1.18208819 -1.31062148]]
[  20.00449025  -30.41054677   52.65371365 -119.26376184   33.78805456
  -78.12189078  -88.41673748 -177.21674804  -90.13920313 -197.90799195]

Let’s turn this dataset into a Pandas DataFrame :

# Wrap the generated arrays in a DataFrame: columns f1..f4 plus target y.
feature_names = [f"f{i+1}" for i in range(N_FEATURES)]
df = pd.DataFrame(data=dataset[0], columns=feature_names)

df["y"] = dataset[1]
df.head()
f1 f2 f3 f4 y
0 0.873059 -1.630962 0.525384 -0.190358 20.004490
1 1.006987 0.798349 -0.040577 -0.313586 -30.410547
2 -0.614643 1.651103 0.757915 -0.003984 52.653714
3 -1.085367 1.823378 0.461259 -1.723253 -119.263762
4 -1.677748 -0.544013 0.863479 -0.302505 33.788055

Let’s plot the data:

from plotutils import *


def plot_regression(df, size):
    """Scatter each of the first ``size`` feature columns of ``df`` against
    the target column ``y``, overlaying a first-degree least-squares fit.

    The grid row count is derived from ``size`` (2 plots per row) instead of
    the previous hard-coded 2x2 grid, so any number of features works.
    """
    rows = (size + 1) // 2  # ceil(size / 2) without importing math
    for i in range(size):
        feature = df.columns[i]
        # Straight-line (degree-1) least-squares fit of y on this feature.
        fit = np.polyfit(df[feature], df["y"], 1)
        fit_fn = np.poly1d(fit)
        plt.subplot(rows, 2, i + 1)
        # Fix: labels were swapped — the feature is on the x axis, y on the y axis.
        plt.xlabel(feature)
        plt.ylabel("y")
        plt.scatter(df[feature], df["y"], s=30, c=colours[1], edgecolor=edges[1])
        plt.plot(
            df[feature], fit_fn(df[feature]), ls="--", c=colours[0], lw=1
        )


plot_regression(df, N_FEATURES)
_images/data-synthetic_9_0.png

Changing the Gaussian noise level

The noise parameter in make_regression allows us to adjust the scale of the zero-centred Gaussian noise applied to the output.

# Same random linear model as before, but now with Gaussian noise of
# standard deviation 2.0 added to the output.
dataset = make_regression(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=2,
    n_targets=N_TARGETS,
    bias=0.0,
    effective_rank=None,
    tail_strength=0.5,
    noise=2.0,  # std. dev. of the zero-centred Gaussian noise on y
    shuffle=True,
    coef=False,
    random_state=random_state,
)

df = pd.DataFrame(data=dataset[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])

df["y"] = dataset[1]
plot_regression(df, N_FEATURES)
_images/data-synthetic_13_0.png

Visualising increasing noise

Let’s increase the noise to \(10^i\), for \(i=0, 1, 2\) (i.e. noise levels 1, 10 and 100), and see what the data looks like.

df = pd.DataFrame(data=np.zeros((N_SAMPLES, 1)))  # placeholder frame; feature/target columns are filled in below


def create_noisy_data(noise):
    """Return a single-feature regression dataset whose Gaussian noise has
    standard deviation ``noise``; every other parameter is held fixed."""
    fixed_params = dict(
        n_samples=N_SAMPLES,
        n_features=1,
        n_informative=1,
        n_targets=1,
        bias=0.0,
        effective_rank=None,
        tail_strength=0.5,
        shuffle=True,
        coef=False,
        random_state=random_state,
    )
    return make_regression(noise=noise, **fixed_params)


# Populate one (feature, target) column pair per noise level 10**i.
for i in range(3):
    features, targets = create_noisy_data(10 ** i)

    df[f"f{i+1}"] = features
    df[f"y{i+1}"] = targets
# One panel per noise level: scatter plus a straight least-squares fit line.
for i in range(3):
    x_col, y_col = f"f{i+1}", f"y{i+1}"
    fit_fn = np.poly1d(np.polyfit(df[x_col], df[y_col], 1))
    plt.subplot(1, 3, i + 1)
    plt.scatter(df[x_col], df[y_col], s=30, c=colours[1], edgecolor=edges[1])
    plt.plot(
        df[x_col],
        fit_fn(df[x_col]),
        ls="--",
        color=colours[0],
        lw=1,
    )
    plt.xlabel(x_col)
    plt.ylabel(y_col)
_images/data-synthetic_16_0.png

Classification data

To generate data for classification we will use the make_classification method.

from sklearn.datasets import make_classification

N = 4  # number of input features

# Binary classification data: all 4 features informative, one Gaussian
# cluster per class, and ~1% of the labels flipped at random.
data = make_classification(
    n_samples=N_SAMPLES,
    n_features=N,
    n_informative=4,  # features that actually carry class signal
    n_redundant=0,  # no linear combinations of informative features
    n_repeated=0,  # no duplicated features
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,  # None -> balanced classes
    flip_y=0.01,  # fraction of labels assigned at random (label noise)
    class_sep=1.0,  # hypercube scaling factor; larger = easier task
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=random_state,
)

df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N)])

df["y"] = data[1]
df.head()
f1 f2 f3 f4 y
0 -3.216 -0.416 -1.295 -1.882 0
1 -1.426 -1.257 -1.734 -1.804 0
2 2.798 -3.010 -1.085 -3.134 1
3 0.633 2.502 -1.553 1.625 1
4 1.494 0.912 -1.887 -1.457 1
from itertools import combinations
from math import ceil

# All unordered pairs of feature columns (the last column is the target y).
lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)

# One scatter panel per feature pair, coloured by class label.
for idx, (var1, var2) in enumerate(lst_var, start=1):
    plt.subplot(2, ceil(len_var / 2), idx)
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda label: colours[label]),
        edgecolor=df["y"].apply(lambda label: edges[label]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)
_images/data-synthetic_20_0.png

Cluster separation

According to the documentation 2, class_sep is the factor multiplying the hypercube size.

Larger values spread out the clusters/classes and make the classification task easier.

N_FEATURES = 4

# Same setup as before, but with a larger class_sep (3.0): the class
# clusters are spread further apart, making the task easier.
# NOTE(review): random_state=None makes this cell non-reproducible —
# presumably intentional to show a fresh draw each run; confirm.
data = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=4,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,
    flip_y=0.01,
    class_sep=3.0,  # larger factor -> better-separated classes
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=None,
)

df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])

df["y"] = data[1]
from itertools import combinations
from math import ceil

# All unordered pairs of feature columns (the last column is the target y).
lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)

# One scatter panel per feature pair, coloured by class label.
for idx, (var1, var2) in enumerate(lst_var, start=1):
    plt.subplot(2, ceil(len_var / 2), idx)
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda label: colours[label]),
        edgecolor=df["y"].apply(lambda label: edges[label]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)
_images/data-synthetic_23_0.png

We can make the clusters harder to separate by decreasing the value of class_sep.

N_FEATURES = 4

# Same setup with a smaller class_sep (0.5): the class clusters sit closer
# together, making them harder to separate.
# NOTE(review): random_state=None -> non-reproducible draw; confirm intended.
data = make_classification(
    n_samples=N_SAMPLES,
    n_features=N_FEATURES,
    n_informative=4,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=None,
    flip_y=0.01,
    class_sep=0.5,  # smaller factor -> overlapping classes
    hypercube=True,
    shift=0.0,
    scale=1.0,
    shuffle=True,
    random_state=None,
)

df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])

df["y"] = data[1]
from itertools import combinations
from math import ceil

# All unordered pairs of feature columns (the last column is the target y).
lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)

# One scatter panel per feature pair, coloured by class label.
for idx, (var1, var2) in enumerate(lst_var, start=1):
    plt.subplot(2, ceil(len_var / 2), idx)
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda label: colours[label]),
        edgecolor=df["y"].apply(lambda label: edges[label]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)
_images/data-synthetic_26_0.png

Noise level

According to the documentation 2 , flip_y is the fraction of samples whose class is assigned randomly.

Larger values introduce noise in the labels and make the classification task harder.

N_FEATURES = 4


# One panel per label-noise level: flip_y grows from 0.0 to 0.5 in 0.1 steps.
for i in range(6):
    flip = 0.1 * i  # fraction of randomly-flipped labels for this panel
    data = make_classification(
        n_samples=N_SAMPLES,
        n_features=N_FEATURES,
        n_informative=4,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        weights=None,
        flip_y=flip,
        class_sep=1.0,
        hypercube=True,
        shift=0.0,
        scale=1.0,
        shuffle=False,
        random_state=random_state,
    )
    df = pd.DataFrame(data[0], columns=[f"f{j+1}" for j in range(N_FEATURES)])
    df["y"] = data[1]
    plt.subplot(2, 3, i + 1)
    plt.title(f"flip_y={round(flip,2)}")
    plt.scatter(
        df["f1"],
        df["f2"],
        s=50,
        c=df["y"].apply(lambda label: colours[label]),
        edgecolor=df["y"].apply(lambda label: edges[label]),
    )
plt.tight_layout(pad=3.0)
_images/data-synthetic_28_0.png
df = pd.DataFrame(data=np.zeros((N_SAMPLES, 1)))  # placeholder frame

# Generate three 2-feature datasets with class_sep = 0.5, 1.5, 2.5 and store
# each dataset's two features and labels as dedicated columns.
for i in range(3):
    data = make_classification(
        n_samples=N_SAMPLES,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        weights=None,
        flip_y=0,
        class_sep=i + 0.5,
        hypercube=True,
        shift=0.0,
        scale=1.0,
        shuffle=False,
        random_state=random_state,
    )
    features, labels = data[0], data[1]
    df[f"f{i+1}1"] = features[:, 0]
    df[f"f{i+1}2"] = features[:, 1]
    df[f"t{i+1}"] = labels

# One panel per class_sep value.
for i in range(3):
    plt.subplot(1, 3, i + 1)
    panel_labels = df[f"t{i+1}"]
    plt.scatter(
        df[f"f{i+1}1"],
        df[f"f{i+1}2"],
        s=50,
        c=panel_labels.apply(lambda label: colours[label]),
        edgecolor=panel_labels.apply(lambda label: edges[label]),
    )
_images/data-synthetic_29_0.png

Separability

from sklearn.datasets import make_blobs

N_FEATURE = 4  # NOTE(review): singular name; the rest of the file uses N_FEATURES

# Three isotropic Gaussian blobs (one per class) in 4 dimensions.
data = make_blobs(
    n_samples=60,
    n_features=N_FEATURE,
    centers=3,  # number of blobs, i.e. classes
    cluster_std=1.0,  # standard deviation of each blob
    center_box=(-5.0, 5.0),  # bounding box for the random blob centres
    shuffle=True,
    random_state=None,  # non-reproducible: new blobs on every run
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURE)])
df["y"] = data[1]
from itertools import combinations
from math import ceil

# All unordered pairs of feature columns (the last column is the target y).
lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)
# One scatter panel per feature pair, coloured by cluster label.
for idx, (var1, var2) in enumerate(lst_var, start=1):
    plt.subplot(2, ceil(len_var / 2), idx)
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda label: colours[label]),
        edgecolor=df["y"].apply(lambda label: edges[label]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)
_images/data-synthetic_32_0.png

To make the clusters more separable we can decrease cluster_std.

# Tighter blobs: a smaller cluster_std (0.3) makes each cluster more compact
# and therefore easier to separate.
data = make_blobs(
    n_samples=60,
    n_features=N_FEATURES,
    centers=3,
    cluster_std=0.3,  # small spread -> well-separated clusters
    center_box=(-5.0, 5.0),
    shuffle=True,
    random_state=None,  # non-reproducible: new blobs on every run
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])
df["y"] = data[1]
from itertools import combinations
from math import ceil

# All unordered pairs of feature columns (the last column is the target y).
lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)
# One scatter panel per feature pair, coloured by cluster label.
for idx, (var1, var2) in enumerate(lst_var, start=1):
    plt.subplot(2, ceil(len_var / 2), idx)
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda label: colours[label]),
        edgecolor=df["y"].apply(lambda label: edges[label]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)
_images/data-synthetic_35_0.png

By increasing cluster_std we make them less separable.

# Wider blobs: a larger cluster_std (2.5) makes the clusters overlap more,
# so they are harder to separate.
data = make_blobs(
    n_samples=60,
    n_features=N_FEATURES,
    centers=3,
    cluster_std=2.5,  # large spread -> overlapping clusters
    center_box=(-5.0, 5.0),
    shuffle=True,
    random_state=None,  # non-reproducible: new blobs on every run
)
df = pd.DataFrame(data[0], columns=[f"f{i+1}" for i in range(N_FEATURES)])
df["y"] = data[1]
from itertools import combinations
from math import ceil

# All unordered pairs of feature columns (the last column is the target y).
lst_var = list(combinations(df.columns[:-1], 2))
len_var = len(lst_var)
# One scatter panel per feature pair, coloured by cluster label.
for idx, (var1, var2) in enumerate(lst_var, start=1):
    plt.subplot(2, ceil(len_var / 2), idx)
    plt.scatter(
        df[var1],
        df[var2],
        s=50,
        c=df["y"].apply(lambda label: colours[label]),
        edgecolor=df["y"].apply(lambda label: edges[label]),
    )
    plt.xlabel(var1)
    plt.ylabel(var2)
_images/data-synthetic_38_0.png

Anisotropic data

# Isotropic blobs, then a fixed linear map to stretch/skew them (anisotropy).
data = make_blobs(n_samples=50, n_features=2, centers=3, cluster_std=1.5)
transformation = [[0.5, -0.5], [-0.4, 0.8]]
data_0 = np.dot(data[0], transformation)
df = pd.DataFrame(data_0, columns=["f1", "f2"])
df["y"] = data[1]
labels = df["y"]
plt.scatter(
    df["f1"],
    df["f2"],
    c=labels.apply(lambda label: colours[label]),
    s=50,
    edgecolors=labels.apply(lambda label: edges[label]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()
_images/data-synthetic_43_0.png

Concentric clusters

Sometimes we might be interested in creating a non-separable cluster.

The simplest way is to create concentric clusters with the make_circles method.

from sklearn.datasets import make_circles

# Two concentric circles; factor=0.6 is the inner/outer radius ratio.
data = make_circles(
    n_samples=N_SAMPLES, shuffle=True, noise=None, random_state=random_state, factor=0.6
)
df = pd.DataFrame(data[0], columns=["f1", "f2"])
df["y"] = data[1]
labels = df["y"]
plt.scatter(
    df["f1"],
    df["f2"],
    c=labels.apply(lambda label: colours[label]),
    s=50,
    edgecolors=labels.apply(lambda label: edges[label]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()
_images/data-synthetic_47_0.png

Adding noise

The noise parameter allows us to create a noisy concentric dataset.

# Same concentric data, now with Gaussian noise of std 0.15 on each point.
data = make_circles(
    n_samples=N_SAMPLES, shuffle=True, noise=0.15, random_state=random_state, factor=0.6
)
df = pd.DataFrame(data[0], columns=["f1", "f2"])
df["y"] = data[1]
labels = df["y"]
plt.scatter(
    df["f1"],
    df["f2"],
    c=labels.apply(lambda label: colours[label]),
    s=50,
    edgecolors=labels.apply(lambda label: edges[label]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()
_images/data-synthetic_50_0.png

Moon clusters

A shape that can be useful for other methods (such as counterfactuals, for instance) is produced by the make_moons method.

from sklearn.datasets import make_moons

# Two interleaving half-circles ("moons"), noise-free.
data = make_moons(
    n_samples=N_SAMPLES, shuffle=True, noise=None, random_state=random_state
)
df = pd.DataFrame(data[0], columns=["f1", "f2"])
df["y"] = data[1]
labels = df["y"]
plt.scatter(
    df["f1"],
    df["f2"],
    c=labels.apply(lambda label: colours[label]),
    s=50,
    edgecolors=labels.apply(lambda label: edges[label]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()
_images/data-synthetic_53_0.png

Adding noise

As usual, the noise parameter allows us to control the noise.

# The same moons with Gaussian noise of std 0.1 on each point.
data = make_moons(
    n_samples=N_SAMPLES, shuffle=True, noise=0.1, random_state=random_state
)
df = pd.DataFrame(data[0], columns=["f1", "f2"])
df["y"] = data[1]
labels = df["y"]
plt.scatter(
    df["f1"],
    df["f2"],
    c=labels.apply(lambda label: colours[label]),
    s=50,
    edgecolors=labels.apply(lambda label: edges[label]),
)
plt.xlabel("f1")
plt.ylabel("f2")
plt.show()
_images/data-synthetic_56_0.png

2 ( 1 , 2 )

https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html