SMILE library random forest classification

Introduction

// Resolve the scala-experiments snapshot artifact through the `$ivy` magic import
// (presumably the source of `ml.smile.data.Loader` below -- confirm).
import $ivy.`dev.ruivieira::scala-experiments:0.1.0-SNAPSHOT`
import ml.smile.data.Loader

// Load the training CSV into a SMILE DataFrame.
val df = Loader.csv("credit_bias-train.csv")
// Per-column summary (count, min, avg, max), left as the final expression so the
// notebook renders it. Columns whose count is below the row count
// (e.g. NrOfDependants) presumably contain missing values -- confirm.
df.summary()
import $ivy.$

import ml.smile.data.Loader


df: smile.data.DataFrame = [NewCreditCustomer: boolean, Amount: double, Interest: double, LoanDuration: int, Education: Double, NrOfDependants: Double, EmploymentDurationCurrentEmployer: Double, IncomeFromPrincipalEmployer: double, IncomeFromPension: double, IncomeFromFamilyAllowance: double, IncomeFromSocialWelfare: double, IncomeFromLeavePay: double, IncomeFromChildSupport: double, IncomeOther: double, ExistingLiabilities: int, RefinanceLiabilities: int, DebtToIncome: Double, FreeCash: Double, CreditScoreEeMini: Double, NoOfPreviousLoansBeforeLoan: Double, AmountOfPreviousLoansBeforeLoan: Double, PreviousRepaymentsBeforeLoan: Double, PreviousEarlyRepaymentsBefoleLoan: Double, PreviousEarlyRepaymentsCountBeforeLoan: Double, PaidLoan: boolean, Council_house: int, Homeless: int, Joint_ownership: int, Joint_tenant: int, Living_with_parents: int, Mortgage: int, Other: int, Owner: int, Owner_with_encumbrance: int, Tenant: int, Entrepreneur: int, Fully: int, Partially: int, Retiree: int, Self_employed: int]
+-----------------+------+--------+------------+---------+--------------+---------------------------------+---------------------------+-----------------+-------------------------+-----------------------+------------------+----------------------+-----------+-------------------+--------------------+------------+--------+-----------------+---------------------------+-------------------------------+----------------------------+---------------------------------+--------------------------------------+--------+-------------+--------+---------------+------------+-------------------+--------+-----+-----+----------------------+------+------------+-----+---------+-------+-------------+
|NewCreditCustomer|Amount|Interest|LoanDuration|Education|NrOfDependants|EmploymentDurationCurrentEmployer|IncomeFromPrincipalEmployer|IncomeFromPension|IncomeFromFamilyAllowance|IncomeFromSocialWelfare|IncomeFromLeavePay|IncomeFromChildSupport|IncomeOther|ExistingLiabilities|RefinanceLiabilities|DebtToIncome|FreeCash|CreditScoreEeMini|NoOfPreviousLoansBeforeLoan|AmountOfPreviousLoansBeforeLoan|PreviousRepaymentsBeforeLoan|PreviousEarlyRepaymentsBefoleLoan|PreviousEarlyRepaym...
res0_3: smile.data.DataFrame = [column: String, count: long, min: double, avg: double, max: double]
+--------------------+-----+----+-----------+------+
|              column|count| min|        avg|   max|
+--------------------+-----+----+-----------+------+
|              Amount|58003|6.39|2242.612586| 10632|
|            Interest|58003|   2|  25.123398| 76.08|
|        LoanDuration|58003|   1|  42.774736|    60|
|           Education|57943|   1|   3.676958|     5|
|      NrOfDependants|20031|   0|   0.815486|    11|
|EmploymentDuratio...|57261|   1|   4.091511|     7|
|IncomeFromPrincip...|58003|   0| 604.457299|133000|
|   IncomeFromPension|58003|   0|   8.875886|  2600|
|IncomeFromFamilyA...|58003|   0|   6.863071|  2006|
|IncomeFromSocialW...|58003|   0|   1.425823|   972|
+--------------------+-----+----+-----------+------+
28 more rows...
import smile.data.DataFrame

// Keep only two numeric features plus the boolean `PaidLoan` column that is
// used as the response further down.
val subset: DataFrame = df.select("Amount", "LoanDuration", "PaidLoan")
// Summarise the reduced frame; left as the final expression so the notebook
// renders it.
subset.summary()
import smile.data.DataFrame


subset: DataFrame = [Amount: double, LoanDuration: int, PaidLoan: boolean]
+------+------------+--------+
|Amount|LoanDuration|PaidLoan|
+------+------------+--------+
|  2125|          60|   false|
|  3000|          60|   false|
|  9100|          60|   false|
|   635|          60|   false|
|  5000|          60|    true|
|  2000|          60|    true|
|   530|          60|    true|
|  5500|          60|    true|
|  6900|          60|    true|
|  3190|          60|   false|
+------+------------+--------+
57993 more rows...

res1_2: DataFrame = [column: String, count: long, min: double, avg: double, max: double]
+------------+-----+----+-----------+-----+
|      column|count| min|        avg|  max|
+------------+-----+----+-----------+-----+
|      Amount|58003|6.39|2242.612586|10632|
|LoanDuration|58003|   1|  42.774736|   60|
+------------+-----+----+-----------+-----+
import smile.data.formula.Formula

// Model formula with `PaidLoan` on the left-hand side only; it prints as
// `PaidLoan ~ .`, i.e. the remaining columns of the frame act as predictors.
val formula: Formula = Formula.lhs("PaidLoan")
import smile.data.formula.Formula


formula: Formula = PaidLoan ~ .
import smile.classification.RandomForest

// Fit a random forest classifier on the three-column subset. No hyperparameters
// are passed, so whatever defaults this `fit` overload uses apply.
val a = RandomForest.fit(formula, subset)
import smile.classification.RandomForest


a: RandomForest = smile.classification.RandomForest@45554613
import smile.data.`type`.{DataTypes, StructField}
import smile.data.Tuple

// Hand-built schema describing a single query row (predictors only, no response).
// NOTE(review): LoanDuration is declared as DoubleType here, whereas the training
// frame `subset` reports it as int -- confirm the mismatch is intended.
val schema = DataTypes.struct(
  new StructField("Amount", DataTypes.DoubleType),
  new StructField("LoanDuration", DataTypes.DoubleType))

// Predict the class for one row whose values lie far outside the training ranges
// (Amount max 10632, LoanDuration max 60). The result is an Int class label;
// presumably 0 corresponds to PaidLoan = false -- confirm the label encoding.
a.predict(Tuple.of(Array(10000000.0, 1000.0), schema))
import smile.data.`type`.{DataTypes, StructField}

import smile.data.Tuple


schema: smile.data.type.StructType = [Amount: double, LoanDuration: double]
res4_3: Int = 0