Introduction
// Load the training data set and print per-column summary statistics.
// `import $ivy` is the notebook's magic import: it resolves the artifact given by the
// Ivy coordinates (here the 0.1.0-SNAPSHOT of scala-experiments) before the cell runs.
import $ivy.`dev.ruivieira::scala-experiments:0.1.0-SNAPSHOT`
// NOTE(review): `Loader` is presumably provided by the artifact fetched above - confirm.
import ml.smile.data.Loader
// Read the CSV into `df`; the recorded output shows a smile.data.DataFrame.
// The path is relative, so the file must be reachable from the notebook's working directory.
val df = Loader.csv("credit_bias-train.csv")
// Produces a frame with columns (column, count, min, avg, max).
// In the recorded output some counts are lower than 58003 (e.g. Education, NrOfDependants),
// which suggests those columns contain missing values - TODO confirm.
df.summary()
import $ivy.$
import ml.smile.data.Loader
df: smile.data.DataFrame = [NewCreditCustomer: boolean, Amount: double, Interest: double, LoanDuration: int, Education: Double, NrOfDependants: Double, EmploymentDurationCurrentEmployer: Double, IncomeFromPrincipalEmployer: double, IncomeFromPension: double, IncomeFromFamilyAllowance: double, IncomeFromSocialWelfare: double, IncomeFromLeavePay: double, IncomeFromChildSupport: double, IncomeOther: double, ExistingLiabilities: int, RefinanceLiabilities: int, DebtToIncome: Double, FreeCash: Double, CreditScoreEeMini: Double, NoOfPreviousLoansBeforeLoan: Double, AmountOfPreviousLoansBeforeLoan: Double, PreviousRepaymentsBeforeLoan: Double, PreviousEarlyRepaymentsBefoleLoan: Double, PreviousEarlyRepaymentsCountBeforeLoan: Double, PaidLoan: boolean, Council_house: int, Homeless: int, Joint_ownership: int, Joint_tenant: int, Living_with_parents: int, Mortgage: int, Other: int, Owner: int, Owner_with_encumbrance: int, Tenant: int, Entrepreneur: int, Fully: int, Partially: int, Retiree: int, Self_employed: int]
+-----------------+------+--------+------------+---------+--------------+---------------------------------+---------------------------+-----------------+-------------------------+-----------------------+------------------+----------------------+-----------+-------------------+--------------------+------------+--------+-----------------+---------------------------+-------------------------------+----------------------------+---------------------------------+--------------------------------------+--------+-------------+--------+---------------+------------+-------------------+--------+-----+-----+----------------------+------+------------+-----+---------+-------+-------------+
|NewCreditCustomer|Amount|Interest|LoanDuration|Education|NrOfDependants|EmploymentDurationCurrentEmployer|IncomeFromPrincipalEmployer|IncomeFromPension|IncomeFromFamilyAllowance|IncomeFromSocialWelfare|IncomeFromLeavePay|IncomeFromChildSupport|IncomeOther|ExistingLiabilities|RefinanceLiabilities|DebtToIncome|FreeCash|CreditScoreEeMini|NoOfPreviousLoansBeforeLoan|AmountOfPreviousLoansBeforeLoan|PreviousRepaymentsBeforeLoan|PreviousEarlyRepaymentsBefoleLoan|PreviousEarlyRepaym...
res0_3: smile.data.DataFrame = [column: String, count: long, min: double, avg: double, max: double]
+--------------------+-----+----+-----------+------+
| column|count| min| avg| max|
+--------------------+-----+----+-----------+------+
| Amount|58003|6.39|2242.612586| 10632|
| Interest|58003| 2| 25.123398| 76.08|
| LoanDuration|58003| 1| 42.774736| 60|
| Education|57943| 1| 3.676958| 5|
| NrOfDependants|20031| 0| 0.815486| 11|
|EmploymentDuratio...|57261| 1| 4.091511| 7|
|IncomeFromPrincip...|58003| 0| 604.457299|133000|
| IncomeFromPension|58003| 0| 8.875886| 2600|
|IncomeFromFamilyA...|58003| 0| 6.863071| 2006|
|IncomeFromSocialW...|58003| 0| 1.425823| 972|
+--------------------+-----+----+-----------+------+
28 more rows...
// Narrow the data down to the two features used for modelling plus the target column.
import smile.data.DataFrame
// Keep only Amount, LoanDuration and PaidLoan from `df`.
val subset: DataFrame = df.select("Amount", "LoanDuration", "PaidLoan")
// In the recorded output the summary lists only Amount and LoanDuration;
// the boolean PaidLoan column does not appear in it.
subset.summary()
import smile.data.DataFrame
subset: DataFrame = [Amount: double, LoanDuration: int, PaidLoan: boolean]
+------+------------+--------+
|Amount|LoanDuration|PaidLoan|
+------+------------+--------+
| 2125| 60| false|
| 3000| 60| false|
| 9100| 60| false|
| 635| 60| false|
| 5000| 60| true|
| 2000| 60| true|
| 530| 60| true|
| 5500| 60| true|
| 6900| 60| true|
| 3190| 60| false|
+------+------------+--------+
57993 more rows...
res1_2: DataFrame = [column: String, count: long, min: double, avg: double, max: double]
+------------+-----+----+-----------+-----+
| column|count| min| avg| max|
+------------+-----+----+-----------+-----+
| Amount|58003|6.39|2242.612586|10632|
|LoanDuration|58003| 1| 42.774736| 60|
+------------+-----+----+-----------+-----+
// Declare the model formula with PaidLoan on the left-hand side (the response).
import smile.data.formula.Formula
// The recorded output prints this as `PaidLoan ~ .`, i.e. every remaining column of the
// frame the formula is applied to acts as a predictor.
val formula: Formula = Formula.lhs("PaidLoan")
import smile.data.formula.Formula
formula: Formula = PaidLoan ~ .
// Train a random forest classifier for PaidLoan on the three-column subset.
import smile.classification.RandomForest
// No hyper-parameters are passed, so whatever defaults the library applies are used.
// NOTE(review): no random seed is set in this cell, so the fitted model may differ
// between runs - confirm whether reproducibility matters here.
val a = RandomForest.fit(formula, subset)
import smile.classification.RandomForest
a: RandomForest = smile.classification.RandomForest@45554613
// Score a single hand-written observation with the fitted forest `a`.
import smile.data.`type`.{DataTypes, StructField}
import smile.data.Tuple
// Schema describing the two predictor fields of the ad-hoc tuple; PaidLoan is omitted.
// NOTE(review): LoanDuration is declared as DoubleType here, whereas the training frame
// lists it as `int` - the recorded run succeeded, but confirm the mismatch is intended.
val schema = DataTypes.struct(
new StructField("Amount", DataTypes.DoubleType),
new StructField("LoanDuration", DataTypes.DoubleType))
// Both values lie far outside the ranges shown in the summary (Amount max 10632,
// LoanDuration max 60), so this probes the model's behaviour on an extreme input.
// The recorded result is the Int 0; presumably that encodes PaidLoan = false - TODO confirm.
a.predict(Tuple.of(Array(10000000.0, 1000.0), schema))
import smile.data.`type`.{DataTypes, StructField}
import smile.data.Tuple
schema: smile.data.type.StructType = [Amount: double, LoanDuration: double]
res4_3: Int = 0
