Random forests

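Let’s train a random forest on the Titanic training data to predict whether a passenger survived, using their age and sex as inputs.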

from typing import List

import pandas as pd

# Load the Titanic train and test splits from the local data directory.
df_train = pd.read_csv("data/titanic/train.csv")
df_test = pd.read_csv("data/titanic/test.csv")

# Keep only the target ("Survived") and the candidate feature columns.
# The explicit .copy() makes df_train an independent frame, so the column
# assignments that follow do not raise pandas' SettingWithCopyWarning on
# pandas versions without copy-on-write.
df_train = df_train[["Survived", "Pclass", "Sex", "Age", "Fare"]].copy()

from sklearn import preprocessing

le = preprocessing.LabelEncoder()

# Swap the string labels in "Sex" for the integer codes the encoder learns.
df_train["Sex"] = le.fit_transform(df_train["Sex"])

# Mark the encoded feature and the target as pandas categoricals, then
# discard every row that has a missing value in any column.
df_train = df_train.astype({"Sex": "category", "Survived": "category"}).dropna()
from sklearn.ensemble import RandomForestClassifier

# Columns of df_train that the forest is trained on.
input_names = ["Age", "Sex"]

# Build a forest with scikit-learn's default settings and fit it to predict
# the "Survived" column from the selected inputs.
classifier = RandomForestClassifier()
classifier.fit(X=df_train[input_names], y=df_train["Survived"])
RandomForestClassifier()

Visualise decision trees

from sklearn.tree import export_text

# Render the first tree of the fitted forest as indented text, labelling the
# splits with the input column names.
tree = export_text(
    decision_tree=classifier.estimators_[0],
    feature_names=input_names,
)

# The full dump is long, so show only its first 1000 characters.
print(tree[:1000])
|--- Sex <= 0.50
|   |--- Age <= 48.50
|   |   |--- Age <= 42.50
|   |   |   |--- Age <= 21.50
|   |   |   |   |--- Age <= 19.50
|   |   |   |   |   |--- Age <= 18.50
|   |   |   |   |   |   |--- Age <= 17.50
|   |   |   |   |   |   |   |--- Age <= 14.75
|   |   |   |   |   |   |   |   |--- Age <= 14.25
|   |   |   |   |   |   |   |   |   |--- Age <= 12.00
|   |   |   |   |   |   |   |   |   |   |--- Age <= 5.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |   |--- Age >  5.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 3
|   |   |   |   |   |   |   |   |   |--- Age >  12.00
|   |   |   |   |   |   |   |   |   |   |--- class: 1.0
|   |   |   |   |   |   |   |   |--- Age >  14.25
|   |   |   |   |   |   |   |   |   |--- class: 0.0
|   |   |   |   |   |   |   |--- Age >  14.75
|   |   |   |   |   |   |   |   |--- class: 1.0
|   |   |   |   |   |   |--- Age >  17.50
|   |   |   |   |   |   |