In this project we will model Lending Club data available at https://www.kaggle.com/husainsb/lendingclub-issued-loans#lc_loan.csv. We start with data from 2007-2015. We will first analyse the features and calculate Weight of Evidence (WoE) and Information Value (IV). Based on that, we will use logistic regression (a generalised linear model) to calculate the probability of default (PD), and finally use the coefficients of the logistic regression to generate a credit score card.
We will also validate the performance of our predictions. After that, we will predict EAD and LGD.
Next, we use the data for 2016-2017 and treat it as new data that needs to be integrated with our model. However, before we can do that we have to quantify how similar or dissimilar the new data is from the existing data. We will do this via the Population Stability Index (PSI).
Let us start by loading the data and dropping all columns in which every value is missing, as these columns are of no use anyway.
library(tidyverse)
library(dplyr)
library(readxl)
library(stringr)
library(purrr)
library(lubridate)
library(caret)
library(ggplot2)
library(data.table)
library(knitr)
loan_data <- fread("C:\\Asus WebStorage\\nitin.7785@gmail.com\\MySyncFolder\\Credit_Risk_Modelling\\Credit-Risk-Model\\publish_git\\data\\lc_loan.csv")
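The line above only reads the file; a minimal sketch of the all-missing-column drop announced in the text (the same idiom is applied to the 2016-2017 data later on):
# drop columns in which every value is NA
loan_data <- loan_data %>%
  select_if(~ sum(!is.na(.)) > 0)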
Our data has the following columns:
print(colnames(loan_data))
## [1] "id" "member_id"
## [3] "loan_amnt" "funded_amnt"
## [5] "funded_amnt_inv" "term"
## [7] "int_rate" "installment"
## [9] "grade" "sub_grade"
## [11] "emp_title" "emp_length"
## [13] "home_ownership" "annual_inc"
## [15] "verification_status" "issue_d"
## [17] "loan_status" "pymnt_plan"
## [19] "url" "desc"
## [21] "purpose" "title"
## [23] "zip_code" "addr_state"
## [25] "dti" "delinq_2yrs"
## [27] "earliest_cr_line" "inq_last_6mths"
## [29] "mths_since_last_delinq" "mths_since_last_record"
## [31] "open_acc" "pub_rec"
## [33] "revol_bal" "revol_util"
## [35] "total_acc" "initial_list_status"
## [37] "out_prncp" "out_prncp_inv"
## [39] "total_pymnt" "total_pymnt_inv"
## [41] "total_rec_prncp" "total_rec_int"
## [43] "total_rec_late_fee" "recoveries"
## [45] "collection_recovery_fee" "last_pymnt_d"
## [47] "last_pymnt_amnt" "next_pymnt_d"
## [49] "last_credit_pull_d" "collections_12_mths_ex_med"
## [51] "mths_since_last_major_derog" "policy_code"
## [53] "application_type" "annual_inc_joint"
## [55] "dti_joint" "verification_status_joint"
## [57] "acc_now_delinq" "tot_coll_amt"
## [59] "tot_cur_bal" "open_acc_6m"
## [61] "open_il_6m" "open_il_12m"
## [63] "open_il_24m" "mths_since_rcnt_il"
## [65] "total_bal_il" "il_util"
## [67] "open_rv_12m" "open_rv_24m"
## [69] "max_bal_bc" "all_util"
## [71] "total_rev_hi_lim" "inq_fi"
## [73] "total_cu_tl" "inq_last_12m"
Let us examine two columns, emp_length and term:
distinct(loan_data, emp_length)
distinct(loan_data, term)
We would like to convert them from strings to numeric values:
numextract <- function(string){
str_extract(string, "\\-*\\d+\\.*\\d*")
}
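A quick sanity check of the helper on illustrative values:
numextract("10+ years")  # "10"
numextract("3 years")    # "3"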
loan_data <- loan_data %>%
mutate(emp_length_int = gsub("< 1 year", "0", emp_length))
loan_data$emp_length_int <- map_chr(loan_data$emp_length_int, numextract) %>%
as.double() %>%
replace_na(0)
loan_data <- loan_data %>%
mutate(term_int = ifelse(term == '36 months', 36, 60))
distinct(loan_data, term_int)
Next we convert earliest_cr_line and issue_d to dates and calculate the number of months up to 2017-12-01.
loan_data$mths_since_earliest_cr_line <- interval(parse_date_time(loan_data$earliest_cr_line, "by"),
as.POSIXct("2017-12-01"))%/% months(1)
loan_data$mths_since_earliest_cr_line <- ifelse(loan_data$mths_since_earliest_cr_line < 0,
max(loan_data$mths_since_earliest_cr_line, na.rm = TRUE),
loan_data$mths_since_earliest_cr_line)
loan_data$mths_since_issue_d <- interval(parse_date_time(loan_data$issue_d, "by"),
as.POSIXct("2017-12-01"))%/% months(1)
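To see what the interval arithmetic does, one illustrative value (the "by" order parses strings like "Jan-2015" as month-year):
interval(parse_date_time("Jan-2015", "by"), as.POSIXct("2017-12-01")) %/% months(1)  # 35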
Check for missing values and fill them in as appropriate:
# sapply(loan_data, function(y) sum(length(which(is.na(y)))))
# total_rev_hi_lim is 'Total revolving high credit / credit limit', so it is reasonable to impute missing values with funded_amnt
loan_data$total_rev_hi_lim <- ifelse(is.na(loan_data$total_rev_hi_lim),
loan_data$funded_amnt, loan_data$total_rev_hi_lim)
# missing annual income is replaced by the mean annual income
loan_data$annual_inc <- ifelse(is.na(loan_data$annual_inc),
mean(loan_data$annual_inc, na.rm = TRUE), loan_data$annual_inc)
# for the rest, fill NA with 0
loan_data$mths_since_earliest_cr_line <- ifelse(is.na(loan_data$mths_since_earliest_cr_line),
0, loan_data$mths_since_earliest_cr_line)
loan_data$acc_now_delinq <- ifelse(is.na(loan_data$acc_now_delinq), 0, loan_data$acc_now_delinq)
loan_data$total_acc <- ifelse(is.na(loan_data$total_acc), 0, loan_data$total_acc)
loan_data$pub_rec <- ifelse(is.na(loan_data$pub_rec), 0, loan_data$pub_rec)
loan_data$open_acc <- ifelse(is.na(loan_data$open_acc), 0, loan_data$open_acc)
loan_data$inq_last_6mths <- ifelse(is.na(loan_data$inq_last_6mths), 0, loan_data$inq_last_6mths)
loan_data$delinq_2yrs <- ifelse(is.na(loan_data$delinq_2yrs), 0, loan_data$delinq_2yrs)
loan_data$emp_length_int <- ifelse(is.na(loan_data$emp_length_int), 0, loan_data$emp_length_int)
As we will not work with all the columns, we keep only the columns listed below:
# remove unwanted columns
keep_columns <- c("id", "grade",
"home_ownership", "loan_status", "addr_state",
"verification_status", "purpose",
"initial_list_status", "term_int",
"emp_length_int", "mths_since_issue_d",
"int_rate", "funded_amnt",
"mths_since_earliest_cr_line", "delinq_2yrs",
"inq_last_6mths", "open_acc",
"pub_rec", "total_acc",
"acc_now_delinq", "total_rev_hi_lim",
"installment", "annual_inc",
"mths_since_last_delinq", "dti",
"mths_since_last_record", "total_rec_prncp",
"total_rec_int", "total_rec_late_fee", "recoveries")
loan_data <- loan_data %>%
select(keep_columns)
To model PD we need to convert each loan_status to default or non-default. Let's first examine the different types of loan_status we have in the data:
print(distinct(loan_data, loan_status))
## loan_status
## 1: Fully Paid
## 2: Charged Off
## 3: Current
## 4: Default
## 5: Late (31-120 days)
## 6: In Grace Period
## 7: Late (16-30 days)
## 8: Does not meet the credit policy. Status:Fully Paid
## 9: Does not meet the credit policy. Status:Charged Off
## 10: Issued
We then convert loan_status to a good/bad flag:
distinct(loan_data, loan_status)
loan_data %>%
group_by(loan_status) %>%
summarise(n_distinct(id)/nrow(loan_data))
# create good_bad (default or not) column from loan_status and observe the frequency of good/bad loans
bad_status <- c('Charged Off', 'Default',
'Does not meet the credit policy. Status:Charged Off',
'Late (31-120 days)')
loan_data$good_bad <- ifelse(loan_data$loan_status %in% bad_status, 0, 1)
loan_data %>% group_by(good_bad) %>% summarise(n_distinct(id)/nrow(loan_data))
So now our data has been preprocessed, and we can move on to feature exploration and engineering to prepare it for logistic regression. Let's first divide our data into train and test sets:
train_index <- createDataPartition(loan_data$good_bad,p=0.9,list=FALSE)
train_data <- loan_data[train_index,]
test_data <- loan_data[-train_index,]
We will engineer/modify our columns based on their Weight of Evidence (WoE) and Information Value (IV). Since we will be doing this for many columns, it makes sense to write a function for calculating WoE. To decide on suitable categories for a feature, we need to look at the WoE per original category and, where possible, combine categories to reduce the number of parameters in our logistic regression model. Inspecting the WoE of each category is best done visually, so we also write a function that plots WoE per feature. Since our features can be either continuous or discrete in nature, we need separate functions to deal with each type.
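For reference, with $n^{good}_i$ and $n^{bad}_i$ the counts of good and bad loans in category $i$, the functions below compute

$$\mathrm{WoE}_i = \ln\frac{n^{good}_i/\sum_j n^{good}_j}{n^{bad}_i/\sum_j n^{bad}_j}, \qquad \mathrm{IV} = \sum_i\left(\frac{n^{good}_i}{\sum_j n^{good}_j} - \frac{n^{bad}_i}{\sum_j n^{bad}_j}\right)\mathrm{WoE}_i$$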
calculate_weight_of_evidence_discrete <- function(train_data, WoE_col, target_col) {
  df1 <- train_data %>%
    select_at(c(WoE_col, target_col))
  # number of observations and proportion of good loans per category
  df_n_obs <- df1 %>% group_by_at(WoE_col) %>% summarise(n_obs = n())
  df_good_prop <- df1 %>% group_by_at(WoE_col) %>% summarise(prop_good = mean(get(target_col)))
  df_final <- df_n_obs %>%
    left_join(df_good_prop)
  # shares of all, good and bad observations per category
  df_final$prop_n_obs <- df_final$n_obs / sum(df_final$n_obs)
  df_final$n_good <- df_final$prop_good * df_final$n_obs
  df_final$n_bad <- (1 - df_final$prop_good) * df_final$n_obs
  df_final$prop_n_good <- df_final$n_good / sum(df_final$n_good)
  df_final$prop_n_bad <- df_final$n_bad / sum(df_final$n_bad)
  # weight of evidence per category, plus differences between neighbouring categories
  df_final$WoE <- log(df_final$prop_n_good / df_final$prop_n_bad)
  df_final <- df_final %>%
    arrange(WoE) %>%
    mutate(diff_prop_good = prop_good - lag(prop_good, default = first(prop_good)),
           diff_WoE = WoE - lag(WoE, default = first(WoE)))
  # neutralise undefined WoE values (categories with no good or no bad loans)
  df_final$WoE <- replace_na(df_final$WoE, 0)
  df_final$WoE <- ifelse(is.finite(df_final$WoE), df_final$WoE, 0)
  # information value of the feature as a whole
  df_final <- df_final %>%
    mutate(IV = sum((prop_n_good - prop_n_bad) * WoE))
  print(paste0(WoE_col, " IV is: ", distinct(df_final, IV)))
  df_final <- df_final %>%
    arrange(WoE) %>%
    mutate(prop_obs = n_obs / sum(n_obs))
}
plot_woe_discrete <- function(plot_data, x_axis, x_label, y_scale, angl) {
ggplot(data = plot_data, aes(x = reorder(get(x_axis), WoE))) +
geom_line(aes(y = WoE, group = 1), color = 'blue') +
geom_point(aes(y = WoE, group = 1), color = 'blue') +
geom_bar(aes(y = prop_obs / y_scale, group = 1), stat="identity", size=.1, color = 'red', fill = "#69b3a2", alpha=.4) +
theme(
axis.title.y = element_text(color = 'blue', size=13),
axis.title.y.right = element_text(color = 'red', size=13)
) +
theme(axis.text.x = element_text(angle = angl, vjust = 0.5, hjust=1)) +
scale_y_continuous(sec.axis = sec_axis(~.*y_scale, name = "Observation proportion")) +
scale_colour_manual(values = c("blue", "red")) +
labs(y = "Weight of evidence", x = x_label)
}
calculate_weight_of_evidence_continuous <- function(train_data, WoE_col, target_col) {
  # identical to the discrete version, except that the final sort is by the
  # (ordered) bin levels rather than by WoE
  df1 <- train_data %>%
    select_at(c(WoE_col, target_col))
  df_n_obs <- df1 %>% group_by_at(WoE_col) %>% summarise(n_obs = n())
  df_good_prop <- df1 %>% group_by_at(WoE_col) %>% summarise(prop_good = mean(get(target_col)))
  df_final <- df_n_obs %>%
    left_join(df_good_prop)
  df_final$prop_n_obs <- df_final$n_obs / sum(df_final$n_obs)
  df_final$n_good <- df_final$prop_good * df_final$n_obs
  df_final$n_bad <- (1 - df_final$prop_good) * df_final$n_obs
  df_final$prop_n_good <- df_final$n_good / sum(df_final$n_good)
  df_final$prop_n_bad <- df_final$n_bad / sum(df_final$n_bad)
  df_final$WoE <- log(df_final$prop_n_good / df_final$prop_n_bad)
  df_final <- df_final %>%
    arrange(WoE) %>%
    mutate(diff_prop_good = prop_good - lag(prop_good, default = first(prop_good)),
           diff_WoE = WoE - lag(WoE, default = first(WoE)))
  df_final$WoE <- replace_na(df_final$WoE, 0)
  df_final$WoE <- ifelse(is.finite(df_final$WoE), df_final$WoE, 0)
  df_final <- df_final %>%
    mutate(IV = sum((prop_n_good - prop_n_bad) * WoE))
  print(paste0(WoE_col, " IV is: ", distinct(df_final, IV)))
  df_final <- df_final %>%
    arrange_at(WoE_col) %>%
    mutate(prop_obs = n_obs / sum(n_obs))
}
plot_woe_continuous <- function(plot_data, x_axis, x_label, y_scale) {
ggplot(data = plot_data, aes(x = get(x_axis), WoE)) +
geom_line(aes(y = WoE, group = 1), color = 'blue') +
geom_point(aes(y = WoE, group = 1), color = 'blue') +
geom_bar(aes(y = prop_obs / y_scale, group = 1), stat="identity", size=.1, color = 'red', fill = "#69b3a2", alpha=.4) +
theme(
axis.title.y = element_text(color = 'blue', size=13),
axis.title.y.right = element_text(color = 'red', size=13)
) +
scale_x_continuous(breaks = seq(min(plot_data %>% select_at(x_axis)), max(plot_data %>% select_at(x_axis)), by = 1)) +
scale_y_continuous(sec.axis = sec_axis(~.*y_scale, name = "Observation proportion")) +
scale_colour_manual(values = c("blue", "red")) +
labs(y = "Weight of evidence", x = x_label)
}
Let's start inspecting the WoE of the features and categorising them accordingly. Before we begin, we initialise a dataframe to keep track of the Information Value of each feature, so that redundant features can later be identified by their IV.
IV_df <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(IV_df) <- c('WoE_col', 'IV_value')
grade
We are all set to explore and engineer features; let's start with grade. As can be seen in the WoE plot, there are not that many grades, and when arranged in ascending order of their WoE they show a clear linear trend. It makes sense to preserve all grade categories.
grade_woe <- calculate_weight_of_evidence_discrete(train_data, "grade", "good_bad")
## [1] "grade IV is: 0.322122592378725"
IV_df[nrow(IV_df) + 1,] <- list("grade", distinct(grade_woe, IV))
plot_woe_discrete(grade_woe, 'grade', 'Grade', 1, 0)
train_data$grade_trnsfrmd <- train_data$grade
home_ownership
Next we examine home_ownership.
distinct(train_data, home_ownership)
home_ownership_woe <- calculate_weight_of_evidence_discrete(train_data, "home_ownership", "good_bad")
## [1] "home_ownership IV is: 0.0228438412849011"
IV_df[nrow(IV_df) + 1,] <- list("home_ownership", distinct(home_ownership_woe, IV))
plot_woe_discrete(home_ownership_woe, 'home_ownership', 'Home Ownership', 5, 0)
combine_home_ownership <- c("RENT", "OTHER", "NONE", "ANY")
train_data$home_ownership_trnsfrmd <- ifelse(train_data$home_ownership %in% combine_home_ownership, "RENT_OTHER_NONE_ANY", train_data$home_ownership)
The WoE plot shows large negative WoE values for OTHER and NONE, but notice the small percentage of observations in those categories; the same holds for ANY. Hence these categories are not reliable and can be clubbed together with RENT. The next two categories, OWN and MORTGAGE, have a substantial number of observations and differing WoE, so they can be kept as separate categories (see the plot below, where OTHER and NONE have been removed for better visibility of the observation frequencies).
plot_woe_discrete(home_ownership_woe %>% slice(3:n()), 'home_ownership', 'Home Ownership', 5, 0)
In general, this is the principle that will be followed to decide on categories for PD modelling. Below, we repeat these steps for all the other features of the input data. From here onwards we proceed without additional explanation, as the basic rationale for selecting categories has been set out above.
addr_state
state_woe <- calculate_weight_of_evidence_discrete(train_data, "addr_state", "good_bad")
## [1] "addr_state IV is: 0.017898691052361"
IV_df[nrow(IV_df) + 1,] <- list("addr_state", distinct(state_woe, IV))
plot_woe_discrete(state_woe %>% slice(2:(n()-1)), 'addr_state', 'State', 0.2, 0)
state1 <- c("NV", "NE", "IA", "ME")
state2 <- c("HI", "FL", "AL", "LA")
state3 <- c("NY")
state4 <- c("NC", "MD", "NM", "VA", "NJ", "UT", "MO")
state5 <- c("CA")
state6 <- c("AZ", "AR", "MI")
state7 <- c("OK", "TN", "PA")
state8 <- c("DE", "MA", "RI", "KY", "MN", "OH", "SD", "IN", "OR", "GA", "WA", "ID", "WI", "MT")
state9 <- c("TX")
state10 <- c("VT", "CT", "IL", "AK", "SC", "CO", "KS", "MS")
state11 <- c("NH", "WV", "WY")
state12 <- c("DC")
train_data$addr_state_trnsfrmd <- ifelse(train_data$addr_state %in% state1, "state_list_1",
ifelse(train_data$addr_state %in% state2, "state_list_2",
ifelse(train_data$addr_state %in% state3, "state_list_3",
ifelse(train_data$addr_state %in% state4, "state_list_4",
ifelse(train_data$addr_state %in% state5, "state_list_5",
ifelse(train_data$addr_state %in% state6, "state_list_6",
ifelse(train_data$addr_state %in% state7, "state_list_7",
ifelse(train_data$addr_state %in% state8, "state_list_8",
ifelse(train_data$addr_state %in% state9, "state_list_9",
ifelse(train_data$addr_state %in% state10, "state_list_10",
ifelse(train_data$addr_state %in% state11, "state_list_11",
ifelse(train_data$addr_state %in% state12, "state_list_12",
NaN))))))))))))
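As an aside, the nested ifelse ladder above can be expressed more readably with dplyr::case_when; a sketch for the first groups (the full version would list all twelve, and note that case_when yields NA rather than the string "NaN" for unmatched states):
train_data <- train_data %>%
  mutate(addr_state_trnsfrmd = case_when(
    addr_state %in% state1 ~ "state_list_1",
    addr_state %in% state2 ~ "state_list_2",
    addr_state %in% state3 ~ "state_list_3",
    # ... state4 through state12 analogously ...
    TRUE ~ NA_character_
  ))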
verification_status
verification_status_woe <- calculate_weight_of_evidence_discrete(train_data, "verification_status", "good_bad")
## [1] "verification_status IV is: 0.0359161144044958"
IV_df[nrow(IV_df) + 1,] <- list("verification_status", distinct(verification_status_woe, IV))
plot_woe_discrete(verification_status_woe, 'verification_status', 'Verification Status', 5, 0)
train_data$verification_status_trnsfrmd <- ifelse(train_data$verification_status %in%
c("Not Verified", "Source Verified"),
"Not or Source Verified",
train_data$verification_status)
purpose
purpose_woe <- calculate_weight_of_evidence_discrete(train_data, "purpose", "good_bad")
## [1] "purpose IV is: 0.0502648847593456"
IV_df[nrow(IV_df) + 1,] <- list("purpose", distinct(purpose_woe, IV))
plot_woe_discrete(purpose_woe, 'purpose', 'Purpose', 2, 0)
purpose1 <- c("small_business", "educational", "renewable_energy", "moving")
purpose2 <- c("other", "medical", "house")
purpose3 <- c("debt_consolidation", "vacation", "home_improvement")
purpose4 <- c("wedding", "car", "major_purchase", "credit_card")
train_data$purpose_trnsfrmd <- ifelse(train_data$purpose %in% purpose1, "purpose_list_1",
ifelse(train_data$purpose %in% purpose2, "purpose_list_2",
ifelse(train_data$purpose %in% purpose3, "purpose_list_3",
ifelse(train_data$purpose %in% purpose4, "purpose_list_4", NaN))))
initial_list_status
init_list_status_woe <- calculate_weight_of_evidence_discrete(train_data, "initial_list_status", "good_bad")
## [1] "initial_list_status IV is: 0.153102356011061"
IV_df[nrow(IV_df) + 1,] <- list("initial_list_status", distinct(init_list_status_woe, IV))
plot_woe_discrete(init_list_status_woe, 'initial_list_status', 'Initial List Status', 2, 0)
train_data$initial_list_status_trnsfrmd <- train_data$initial_list_status
term_int
term_int_woe <- calculate_weight_of_evidence_continuous(train_data, 'term_int', 'good_bad')
## [1] "term_int IV is: 0.0208232197620997"
IV_df[nrow(IV_df) + 1,] <- list("term_int", distinct(term_int_woe, IV))
plot_woe_continuous(term_int_woe, 'term_int', 'Loan Term', 2)
train_data$term_int_trnsfrmd <- ifelse(train_data$term_int == 36, 'term_36',
ifelse(train_data$term_int == 60, 'term_60', NaN))
emp_length_int
emp_length_int_woe <- calculate_weight_of_evidence_continuous(train_data, 'emp_length_int', 'good_bad')
## [1] "emp_length_int IV is: 0.00947114613881647"
IV_df[nrow(IV_df) + 1,] <- list("emp_length_int", distinct(emp_length_int_woe, IV))
plot_woe_continuous(emp_length_int_woe %>% arrange(emp_length_int), 'emp_length_int', 'Employment Length', 3)
train_data$emp_length_int_trnsfrmd <- ifelse(train_data$emp_length_int %in% c(0), 'emp_length_0',
ifelse(train_data$emp_length_int %in% c(1,2,3,4), 'emp_length_1_4',
ifelse(train_data$emp_length_int %in% c(5,6), 'emp_length_5_6',
ifelse(train_data$emp_length_int %in% c(7,8,9), 'emp_length_7_9',
'emp_length_10'))))
mths_since_issue_d
mths_since_issue_d_woe <- calculate_weight_of_evidence_continuous(train_data, 'mths_since_issue_d', 'good_bad')
## [1] "mths_since_issue_d IV is: 0.918043906280894"
IV_df[nrow(IV_df) + 1,] <- list("mths_since_issue_d", distinct(mths_since_issue_d_woe, IV))
plot_woe_continuous(mths_since_issue_d_woe, 'mths_since_issue_d', 'Months Since Issue Date', 0.1)
# loans issued up to 55 months ago share a single bucket
train_data$mths_since_issue_d_trnsfrmd <- ifelse(train_data$mths_since_issue_d <= 55, 'mths_since_issue_date<=55',
                                          ifelse(train_data$mths_since_issue_d %in% c(56,57,58,59,60,61,62,63), 'mths_since_issue_date>=56<=63',
                                          ifelse(train_data$mths_since_issue_d %in% seq(64,84), 'mths_since_issue_date>63<=84',
                                          ifelse(train_data$mths_since_issue_d > 84, 'mths_since_issue_date>84', NaN))))
int_rate_factor
train_data <- train_data %>%
arrange(int_rate)
train_data$int_rate_factor <- cut(train_data$int_rate, 50)
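# Aside: cut() splits a numeric vector into equal-width intervals and
# returns a factor of interval labels, e.g.
#   levels(cut(c(6, 9, 13, 19, 24), breaks = 3))
#   -> "(5.98,12]" "(12,18]" "(18,24]"
# (the outer limits are extended by 0.1% so the extremes are included)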
int_rate_factor_woe <- calculate_weight_of_evidence_continuous(train_data, 'int_rate_factor', 'good_bad')
## [1] "int_rate_factor IV is: 0.556944351873259"
IV_df[nrow(IV_df) + 1,] <- list("int_rate_factor", distinct(int_rate_factor_woe, IV))
plot_woe_discrete(int_rate_factor_woe, 'int_rate_factor', 'Interest rate factor', 0.02, 90)
train_data$int_rate_factor_trnsfrmd <- ifelse(train_data$int_rate <= 9.548, 'int_rate<9.548',
ifelse(train_data$int_rate <= 12.025, '9.548<int_rate<=12.025',
ifelse(train_data$int_rate <= 15.74, '12.025<int_rate<=15.74',
ifelse(train_data$int_rate <= 20.281, '15.74<int_rate<20.281',
ifelse(train_data$int_rate >20.281, 'int_rate>20.281',NaN)))))
funded_amnt_factor
train_data$funded_amnt_factor <- cut(train_data$funded_amnt, 50)
funded_amnt_factor_woe <- calculate_weight_of_evidence_continuous(train_data, 'funded_amnt_factor', 'good_bad')
## [1] "funded_amnt_factor IV is: 0.0191689251632706"
IV_df[nrow(IV_df) + 1,] <- list("funded_amnt_factor", distinct(funded_amnt_factor_woe, IV))
plot_woe_discrete(funded_amnt_factor_woe, 'funded_amnt_factor', 'Funded amount factor', 0.02, 90)
train_data$funded_amnt_factor_trnsfrmd <- ifelse(train_data$funded_amnt <= 5000, 'funded_amnt:<5K',
                                          ifelse(train_data$funded_amnt <= 10000, 'funded_amnt:5K-10K',
                                          ifelse(train_data$funded_amnt <= 20000, 'funded_amnt:10K-20K',
                                          ifelse(train_data$funded_amnt <= 30000, 'funded_amnt:20K-30K',
                                          ifelse(train_data$funded_amnt <= 50000, 'funded_amnt:30K-50K',
                                          ifelse(train_data$funded_amnt <= 70000, 'funded_amnt:50K-70K',
                                          ifelse(train_data$funded_amnt <= 100000, 'funded_amnt:70K-100K',
                                          ifelse(train_data$funded_amnt > 100000, 'funded_amnt:>100K',
                                          NaN))))))))
mths_since_earliest_cr_line_factor
train_data <- train_data %>%
arrange(mths_since_earliest_cr_line)
train_data$mths_since_earliest_cr_line_factor <- cut(train_data$mths_since_earliest_cr_line, 50)
mths_since_earliest_cr_line_woe <- calculate_weight_of_evidence_continuous(train_data, 'mths_since_earliest_cr_line_factor', 'good_bad')
## [1] "mths_since_earliest_cr_line_factor IV is: 0.0151324882437869"
IV_df[nrow(IV_df) + 1,] <- list("mths_since_earliest_cr_line", distinct(mths_since_earliest_cr_line_woe, IV))
plot_woe_discrete(mths_since_earliest_cr_line_woe, 'mths_since_earliest_cr_line_factor', 'Months since earliest credit line', 0.1, 90)
train_data$mths_since_earliest_cr_line_factor_trnsfrmd <- ifelse(train_data$mths_since_earliest_cr_line <= 140, 'mths_since_earliest_cr_line<=140',
ifelse(train_data$mths_since_earliest_cr_line <= 246, '140<mths_since_earliest_cr_line<=246',
ifelse(train_data$mths_since_earliest_cr_line <= 270, '246<mths_since_earliest_cr_line<=270',
ifelse(train_data$mths_since_earliest_cr_line <= 293, '270<mths_since_earliest_cr_line<=293',
ifelse(train_data$mths_since_earliest_cr_line <= 398, '293<mths_since_earliest_cr_line<=398',
ifelse(train_data$mths_since_earliest_cr_line >398, 'mths_since_earliest_cr_line>398',
NaN))))))
delinq_2yrs
delinq_2yrs_woe <- calculate_weight_of_evidence_continuous(train_data, 'delinq_2yrs', 'good_bad')
## [1] "delinq_2yrs IV is: 0.00106847433774323"
IV_df[nrow(IV_df) + 1,] <- list("delinq_2yrs", distinct(delinq_2yrs_woe, IV))
plot_woe_continuous(delinq_2yrs_woe, 'delinq_2yrs', 'Delinquency in last 2 years', 2)
train_data$delinq_2yrs_trnsfrmd <- ifelse(train_data$delinq_2yrs == 0, 'delinq_2yrs=0',
ifelse(train_data$delinq_2yrs == 1, 'delinq_2yrs=1',
ifelse(train_data$delinq_2yrs == 2, 'delinq_2yrs=2',
ifelse(train_data$delinq_2yrs > 2, 'delinq_2yrs>2',
NaN))))
inq_last_6mths
inq_last_6mths_woe <- calculate_weight_of_evidence_continuous(train_data, 'inq_last_6mths', 'good_bad')
## [1] "inq_last_6mths IV is: 0.0925669321100512"
IV_df[nrow(IV_df) + 1,] <- list("inq_last_6mths", distinct(inq_last_6mths_woe, IV))
plot_woe_continuous(inq_last_6mths_woe, 'inq_last_6mths', 'Inq last 6 months', 1)
train_data$inq_last_6mths_trnsfrmd <- ifelse(train_data$inq_last_6mths == 0, 'inq_last_6mths=0',
ifelse(train_data$inq_last_6mths == 1, 'inq_last_6mths=1',
ifelse(train_data$inq_last_6mths == 2, 'inq_last_6mths=2',
ifelse(train_data$inq_last_6mths == 3, 'inq_last_6mths=3',
ifelse(train_data$inq_last_6mths > 3, 'inq_last_6mths>3',
NaN)))))
open_acc
open_acc_woe <- calculate_weight_of_evidence_continuous(train_data, 'open_acc', 'good_bad')
## [1] "open_acc IV is: 0.00856851247687463"
IV_df[nrow(IV_df) + 1,] <- list("open_acc", distinct(open_acc_woe, IV))
plot_woe_continuous(open_acc_woe, 'open_acc', 'Open Account', 0.1)
plot_woe_continuous(open_acc_woe %>% filter(open_acc>3, open_acc<20), 'open_acc', 'Open Account', 2)
train_data$open_acc_trnsfrmd <- ifelse(train_data$open_acc %in% c(0,1,2,3,seq(21,max(train_data$open_acc))), 'open_acc=0,1,2,3,21-max',
ifelse(train_data$open_acc %in% c(4,5,6,7), 'open_acc=4,5,6,7',
ifelse(train_data$open_acc %in% c(8), 'open_acc=8',
ifelse(train_data$open_acc %in% c(9,10,11,12), 'open_acc=9,10,11,12',
ifelse(train_data$open_acc %in% c(13,14,15,16), 'open_acc=13,14,15,16',
ifelse(train_data$open_acc %in% c(17), 'open_acc=17',
ifelse(train_data$open_acc %in% c(18,19,20), 'open_acc=18,19,20',
NaN)))))))
pub_rec
pub_rec_woe <- calculate_weight_of_evidence_continuous(train_data, 'pub_rec', 'good_bad')
## [1] "pub_rec IV is: 0.00528665682035972"
IV_df[nrow(IV_df) + 1,] <- list("pub_rec", distinct(pub_rec_woe, IV))
plot_woe_continuous(pub_rec_woe, 'pub_rec', 'Public Records', 1)
train_data$pub_rec_trnsfrmd <- ifelse(train_data$pub_rec == 0, 'pub_rec=0',
ifelse(train_data$pub_rec == 1, 'pub_rec=1',
ifelse(train_data$pub_rec == 2, 'pub_rec=2',
ifelse(train_data$pub_rec >2, 'pub_rec>=3',
NaN))))
total_acc_factor
train_data <- train_data %>%
arrange(total_acc)
train_data$total_acc_factor <- cut(train_data$total_acc, 50)
total_acc_woe <- calculate_weight_of_evidence_continuous(train_data, 'total_acc_factor', 'good_bad')
## [1] "total_acc_factor IV is: 0.0100298483435379"
IV_df[nrow(IV_df) + 1,] <- list("total_acc_factor", distinct(total_acc_woe, IV))
plot_woe_discrete(total_acc_woe, 'total_acc_factor', 'Total account factor', 0.1, 90)
train_data$total_acc_factor_trnsfrmd <- ifelse(train_data$total_acc <= 10, 'total_acc<=10',
ifelse(train_data$total_acc <= 25, 'total_acc<=25',
ifelse(train_data$total_acc <= 50, 'total_acc<=50',
ifelse(train_data$total_acc > 50, 'total_acc>50',
NaN))))
acc_now_delinq
acc_now_delinq_woe <- calculate_weight_of_evidence_continuous(train_data, 'acc_now_delinq', 'good_bad')
## [1] "acc_now_delinq IV is: 0.000114176389107013"
IV_df[nrow(IV_df) + 1,] <- list("acc_now_delinq", distinct(acc_now_delinq_woe, IV))
plot_woe_continuous(acc_now_delinq_woe, 'acc_now_delinq', 'Account now delinquent', 1.0)
train_data$acc_now_delinq_trnsfrmd <- ifelse(train_data$acc_now_delinq == 0, 'acc_now_delinq=0',
ifelse(train_data$acc_now_delinq >= 1, 'acc_now_delinq>=1',
NaN))
total_rev_hi_lim_factor
train_data <- train_data %>%
arrange(total_rev_hi_lim)
train_data$total_rev_hi_lim_factor <- cut(train_data$total_rev_hi_lim, 1000)
total_rev_hi_lim_woe <- calculate_weight_of_evidence_continuous(train_data, 'total_rev_hi_lim_factor', 'good_bad')
## [1] "total_rev_hi_lim_factor IV is: 0.0826053581761765"
IV_df[nrow(IV_df) + 1,] <- list("total_rev_hi_lim_factor", distinct(total_rev_hi_lim_woe, IV))
plot_woe_discrete(total_rev_hi_lim_woe, 'total_rev_hi_lim_factor', 'Total revolving hi limit factor', 1, 90)
train_data$total_rev_hi_lim_factor_trnsfrmd <- ifelse(train_data$total_rev_hi_lim <= 5000, 'total_rev_hi_lim<=5K',
ifelse(train_data$total_rev_hi_lim <= 10000, 'total_rev_hi_lim:5K-10K',
ifelse(train_data$total_rev_hi_lim <= 20000, 'total_rev_hi_lim:10K-20K',
ifelse(train_data$total_rev_hi_lim <= 30000, 'total_rev_hi_lim:20K-30K',
ifelse(train_data$total_rev_hi_lim <= 40000, 'total_rev_hi_lim:30K-40K',
ifelse(train_data$total_rev_hi_lim <= 50000, 'total_rev_hi_lim:40K-50K',
ifelse(train_data$total_rev_hi_lim <= 100000, 'total_rev_hi_lim:50K-100K',
ifelse(train_data$total_rev_hi_lim > 100000, 'total_rev_hi_lim>100K',
NaN))))))))
installment_factor
train_data <- train_data %>% arrange(installment)
train_data$installment_factor <- cut(train_data$installment, 100)
installment_woe <- calculate_weight_of_evidence_continuous(train_data, 'installment_factor', 'good_bad')
## [1] "installment_factor IV is: 0.0282190404686732"
IV_df[nrow(IV_df) + 1,] <- list("installment_factor", distinct(installment_woe, IV))
plot_woe_discrete(installment_woe, 'installment_factor', 'Installment Factor', 0.05, 90)
train_data$installment_factor_trnsfrmd <- ifelse(train_data$installment <= 30, 'installment<=30',
ifelse(train_data$installment <= 100, 'installment:30-100',
ifelse(train_data$installment <= 200, 'installment:100-200',
ifelse(train_data$installment <= 300, 'installment:200-300',
ifelse(train_data$installment <= 400, 'installment:300-400',
ifelse(train_data$installment <= 500, 'installment:400-500',
ifelse(train_data$installment <= 600, 'installment:500-600',
ifelse(train_data$installment <= 700, 'installment:600-700',
ifelse(train_data$installment <= 800, 'installment:700-800',
ifelse(train_data$installment > 800, 'installment>800',
NaN))))))))))
annual_inc_factor
train_data <- train_data %>% arrange(annual_inc)
train_data_temp <- train_data %>%
filter(annual_inc <= 140000)
train_data_temp$annual_inc_factor <- cut(train_data_temp$annual_inc, 50)
annual_inc_woe <- calculate_weight_of_evidence_continuous(train_data_temp, 'annual_inc_factor', 'good_bad')
## [1] "annual_inc_factor IV is: 0.0523239023770557"
IV_df[nrow(IV_df) + 1,] <- list("annual_inc_factor", distinct(annual_inc_woe, IV))
plot_woe_discrete(annual_inc_woe, 'annual_inc_factor', 'Annual Income Factor', 0.1, 90)
train_data$annual_inc_factor_trnsfrmd <- ifelse(train_data$annual_inc <= 20000, 'annual_inc<=20K',
ifelse(train_data$annual_inc <= 30000, 'annual_inc:20K-30K',
ifelse(train_data$annual_inc <= 40000, 'annual_inc:30K-40K',
ifelse(train_data$annual_inc <= 50000, 'annual_inc:40K-50K',
ifelse(train_data$annual_inc <= 60000, 'annual_inc:50K-60K',
ifelse(train_data$annual_inc <= 70000, 'annual_inc:60K-70K',
ifelse(train_data$annual_inc <= 80000, 'annual_inc:70K-80K',
ifelse(train_data$annual_inc <= 90000, 'annual_inc:80K-90K',
ifelse(train_data$annual_inc <= 100000, 'annual_inc:90K-100K',
ifelse(train_data$annual_inc <= 120000, 'annual_inc:100K-120K',
ifelse(train_data$annual_inc <= 140000, 'annual_inc:120K-140K',
ifelse(train_data$annual_inc > 140000, 'annual_inc:>140K',
NaN))))))))))))
mths_since_last_delinq_factor
train_data <- train_data %>%
arrange(mths_since_last_delinq)
train_data$mths_since_last_delinq_factor <- cut(train_data$mths_since_last_delinq, 50)
mths_since_last_delinq_woe <- calculate_weight_of_evidence_continuous(train_data, 'mths_since_last_delinq_factor', 'good_bad')
## [1] "mths_since_last_delinq_factor IV is: 0.00477673892483607"
IV_df[nrow(IV_df) + 1,] <- list("mths_since_last_delinq_factor", distinct(mths_since_last_delinq_woe, IV))
plot_woe_discrete(mths_since_last_delinq_woe, 'mths_since_last_delinq_factor', 'Months since last delinquency factor', 0.5, 90)
train_data$mths_since_last_delinq_factor_trnsfrmd <- ifelse(train_data$mths_since_last_delinq <= 3, 'mths_since_last_delinq<=3',
ifelse(train_data$mths_since_last_delinq <= 7, 'mths_since_last_delinq:4-7',
ifelse(train_data$mths_since_last_delinq <= 40, 'mths_since_last_delinq:7-40',
ifelse(train_data$mths_since_last_delinq <= 80, 'mths_since_last_delinq:40-80',
ifelse(train_data$mths_since_last_delinq > 80, 'mths_since_last_delinq>80',
ifelse(is.na(train_data$mths_since_last_delinq), 'Missing',
NaN))))))
dti_factor
train_data <- train_data %>%
arrange(dti)
train_data_temp <- train_data %>%
filter(dti <= 35)
train_data_temp$dti_factor <- cut(train_data_temp$dti, 20) # try with 100 and 50 cuts
train_data <- train_data %>%
left_join(train_data_temp %>% select(id, dti_factor))
dti_woe <- calculate_weight_of_evidence_continuous(train_data_temp, 'dti_factor', 'good_bad')
## [1] "dti_factor IV is: 0.0131144001658101"
IV_df[nrow(IV_df) + 1,] <- list("dti_factor", distinct(dti_woe, IV))
plot_woe_discrete(dti_woe, 'dti_factor', 'Debt to Income', 0.1, 90)
train_data$dti_factor_trnsfrmd <- ifelse(train_data$dti <= 1, 'dti:<1',
ifelse(train_data$dti <= 2, 'dti:1-2',
ifelse(train_data$dti <= 4, 'dti:2-4',
ifelse(train_data$dti <= 6, 'dti:4-6',
ifelse(train_data$dti <= 8, 'dti:6-8',
ifelse(train_data$dti <= 10, 'dti:8-10',
ifelse(train_data$dti <= 15, 'dti:10-15',
ifelse(train_data$dti <= 20, 'dti:15-20',
ifelse(train_data$dti <= 25, 'dti:20-25',
ifelse(train_data$dti <= 30, 'dti:25-30',
ifelse(train_data$dti <= 35, 'dti:30-35',
ifelse(train_data$dti >35, 'dti:>35',
NaN))))))))))))
mths_since_last_record_factor
train_data <- train_data %>%
arrange(mths_since_last_record)
train_data$mths_since_last_record_factor <- cut(train_data$mths_since_last_record, 20)
mths_since_last_record_woe <- calculate_weight_of_evidence_continuous(train_data, 'mths_since_last_record_factor', 'good_bad')
## [1] "mths_since_last_record_factor IV is: 0.0253121582979921"
IV_df[nrow(IV_df) + 1,] <- list("mths_since_last_record_factor", distinct(mths_since_last_record_woe, IV))
plot_woe_discrete(mths_since_last_record_woe, 'mths_since_last_record_factor', 'Months since last record factor', 0.1, 90)
train_data$mths_since_last_record_factor_trnsfrmd <- ifelse(train_data$mths_since_last_record <= 2, 'mths_since_last_record<=2',
ifelse(train_data$mths_since_last_record <= 30, 'mths_since_last_record:3-30',
ifelse(train_data$mths_since_last_record <= 65, 'mths_since_last_record:31-65',
ifelse(train_data$mths_since_last_record <= 80, 'mths_since_last_record:66-80',
ifelse(train_data$mths_since_last_record > 80, 'mths_since_last_record>80',
ifelse(is.na(train_data$mths_since_last_record), 'Missing',
NaN))))))
At the end, we retain only the transformed columns and the good/bad flag.
train_data <- train_data %>%
mutate(mths_since_last_record_factor_trnsfrmd = replace_na(mths_since_last_record_factor_trnsfrmd, 'Missing'))
train_data <- train_data %>%
mutate(mths_since_last_delinq_factor_trnsfrmd = replace_na(mths_since_last_delinq_factor_trnsfrmd, 'Missing'))
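The replace_na calls above are needed because ifelse propagates missingness: when the condition is NA, neither branch (including the final is.na check in the ladders above) is ever reached. A one-line illustration:
ifelse(NA > 80, 'mths_since_last_record>80', 'other')  # returns NA, not 'other'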
train_data_trnsfrmd <- train_data %>%
select(ends_with(c('good_bad', 'trnsfrmd')))
Using exactly the same categories, we transform the test data so that we can later measure the performance of our PD model.
test_data$grade_trnsfrmd <- test_data$grade
test_data$home_ownership_trnsfrmd <- ifelse(test_data$home_ownership %in% combine_home_ownership, "RENT_OTHER_NONE_ANY", test_data$home_ownership)
state1 <- c("NV", "NE", "IA", "ME")
state2 <- c("HI", "FL", "AL", "LA")
state3 <- c("NY")
state4 <- c("NC", "MD", "NM", "VA", "NJ", "UT", "MO")
state5 <- c("CA")
state6 <- c("AZ", "AR", "MI")
state7 <- c("OK", "TN", "PA")
state8 <- c("DE", "MA", "RI", "KY", "MN", "OH", "SD", "IN", "OR", "GA", "WA", "ID", "WI", "MT")
state9 <- c("TX")
state10 <- c("VT", "CT", "IL", "AK", "SC", "CO", "KS", "MS")
state11 <- c("NH", "WV", "WY")
state12 <- c("DC")
test_data$addr_state_trnsfrmd <- ifelse(test_data$addr_state %in% state1, "state_list_1",
ifelse(test_data$addr_state %in% state2, "state_list_2",
ifelse(test_data$addr_state %in% state3, "state_list_3",
ifelse(test_data$addr_state %in% state4, "state_list_4",
ifelse(test_data$addr_state %in% state5, "state_list_5",
ifelse(test_data$addr_state %in% state6, "state_list_6",
ifelse(test_data$addr_state %in% state7, "state_list_7",
ifelse(test_data$addr_state %in% state8, "state_list_8",
ifelse(test_data$addr_state %in% state9, "state_list_9",
ifelse(test_data$addr_state %in% state10, "state_list_10",
ifelse(test_data$addr_state %in% state11, "state_list_11",
ifelse(test_data$addr_state %in% state12, "state_list_12",NaN))))))))))))
test_data$verification_status_trnsfrmd <- ifelse(test_data$verification_status %in% c("Not Verified", "Source Verified"),
"Not or Source Verified",
test_data$verification_status)
purpose1 <- c("small_business", "educational", "renewable_energy", "moving")
purpose2 <- c("other", "medical", "house")
purpose3 <- c("debt_consolidation", "vacation", "home_improvement")
purpose4 <- c("wedding", "car", "major_purchase", "credit_card")
test_data$purpose_trnsfrmd <- ifelse(test_data$purpose %in% purpose1, "purpose_list_1",
ifelse(test_data$purpose %in% purpose2, "purpose_list_2",
ifelse(test_data$purpose %in% purpose3, "purpose_list_3",
ifelse(test_data$purpose %in% purpose4, "purpose_list_4", NaN))))
test_data$initial_list_status_trnsfrmd <- test_data$initial_list_status
test_data$term_int_trnsfrmd <- ifelse(test_data$term_int == 36, 'term_36',
ifelse(test_data$term_int == 60, 'term_60', NaN))
test_data$emp_length_int_trnsfrmd <- ifelse(test_data$emp_length_int %in% c(0), 'emp_length_0',
ifelse(test_data$emp_length_int %in% c(1,2,3,4), 'emp_length_1_4',
ifelse(test_data$emp_length_int %in% c(5,6), 'emp_length_5_6',
ifelse(test_data$emp_length_int %in% c(7,8,9), 'emp_length_7_9',
'emp_length_10'))))
test_data$mths_since_issue_d_trnsfrmd <- ifelse(test_data$mths_since_issue_d <= 55, 'mths_since_issue_date<=55',
                                         ifelse(test_data$mths_since_issue_d %in% c(56,57,58,59,60,61,62,63), 'mths_since_issue_date>=56<=63',
                                         ifelse(test_data$mths_since_issue_d %in% seq(64,84), 'mths_since_issue_date>63<=84',
                                         ifelse(test_data$mths_since_issue_d > 84, 'mths_since_issue_date>84', NaN))))
test_data$int_rate_factor_trnsfrmd <- ifelse(test_data$int_rate <= 9.548, 'int_rate<9.548',
ifelse(test_data$int_rate <= 12.025, '9.548<int_rate<=12.025',
ifelse(test_data$int_rate <= 15.74, '12.025<int_rate<=15.74',
ifelse(test_data$int_rate <= 20.281, '15.74<int_rate<20.281',
ifelse(test_data$int_rate >20.281, 'int_rate>20.281',NaN)))))
test_data$funded_amnt_factor_trnsfrmd <- ifelse(test_data$funded_amnt <= 5000, 'funded_amnt:<5K',
                                         ifelse(test_data$funded_amnt <= 10000, 'funded_amnt:5K-10K',
                                         ifelse(test_data$funded_amnt <= 20000, 'funded_amnt:10K-20K',
                                         ifelse(test_data$funded_amnt <= 30000, 'funded_amnt:20K-30K',
                                         ifelse(test_data$funded_amnt <= 50000, 'funded_amnt:30K-50K',
                                         ifelse(test_data$funded_amnt <= 70000, 'funded_amnt:50K-70K',
                                         ifelse(test_data$funded_amnt <= 100000, 'funded_amnt:70K-100K',
                                         ifelse(test_data$funded_amnt > 100000, 'funded_amnt:>100K',
                                         NaN))))))))
test_data$mths_since_earliest_cr_line_factor_trnsfrmd <- ifelse(test_data$mths_since_earliest_cr_line <= 140, 'mths_since_earliest_cr_line<=140',
ifelse(test_data$mths_since_earliest_cr_line <= 246, '140<mths_since_earliest_cr_line<=246',
ifelse(test_data$mths_since_earliest_cr_line <= 270, '246<mths_since_earliest_cr_line<=270',
ifelse(test_data$mths_since_earliest_cr_line <= 293, '270<mths_since_earliest_cr_line<=293',
ifelse(test_data$mths_since_earliest_cr_line <= 398, '293<mths_since_earliest_cr_line<=398',
ifelse(test_data$mths_since_earliest_cr_line >398, 'mths_since_earliest_cr_line>398',
NaN))))))
test_data$delinq_2yrs_trnsfrmd <- ifelse(test_data$delinq_2yrs == 0, 'delinq_2yrs=0',
ifelse(test_data$delinq_2yrs == 1, 'delinq_2yrs=1',
ifelse(test_data$delinq_2yrs == 2, 'delinq_2yrs=2',
ifelse(test_data$delinq_2yrs > 2, 'delinq_2yrs>2',
NaN))))
test_data$inq_last_6mths_trnsfrmd <- ifelse(test_data$inq_last_6mths == 0, 'inq_last_6mths=0',
ifelse(test_data$inq_last_6mths == 1, 'inq_last_6mths=1',
ifelse(test_data$inq_last_6mths == 2, 'inq_last_6mths=2',
ifelse(test_data$inq_last_6mths == 3, 'inq_last_6mths=3',
ifelse(test_data$inq_last_6mths > 3, 'inq_last_6mths>3',
NaN)))))
test_data$open_acc_trnsfrmd <- ifelse(test_data$open_acc %in% c(0,1,2,3,seq(21,max(test_data$open_acc))), 'open_acc=0,1,2,3,21-max',
ifelse(test_data$open_acc %in% c(4,5,6,7), 'open_acc=4,5,6,7',
ifelse(test_data$open_acc %in% c(8), 'open_acc=8',
ifelse(test_data$open_acc %in% c(9,10,11,12), 'open_acc=9,10,11,12',
ifelse(test_data$open_acc %in% c(13,14,15,16), 'open_acc=13,14,15,16',
ifelse(test_data$open_acc %in% c(17), 'open_acc=17',
ifelse(test_data$open_acc %in% c(18,19,20), 'open_acc=18,19,20',
NaN)))))))
test_data$pub_rec_trnsfrmd <- ifelse(test_data$pub_rec == 0, 'pub_rec=0',
ifelse(test_data$pub_rec == 1, 'pub_rec=1',
ifelse(test_data$pub_rec == 2, 'pub_rec=2',
ifelse(test_data$pub_rec >2, 'pub_rec>=3',
NaN))))
test_data$total_acc_factor_trnsfrmd <- ifelse(test_data$total_acc <= 10, 'total_acc<=10',
ifelse(test_data$total_acc <= 25, 'total_acc<=25',
ifelse(test_data$total_acc <= 50, 'total_acc<=50',
ifelse(test_data$total_acc > 50, 'total_acc>50',
NaN))))
test_data$acc_now_delinq_trnsfrmd <- ifelse(test_data$acc_now_delinq == 0, 'acc_now_delinq=0',
ifelse(test_data$acc_now_delinq >= 1, 'acc_now_delinq>=1',
NaN))
test_data$total_rev_hi_lim_factor_trnsfrmd <- ifelse(test_data$total_rev_hi_lim <= 5000, 'total_rev_hi_lim<=5K',
ifelse(test_data$total_rev_hi_lim <= 10000, 'total_rev_hi_lim:5K-10K',
ifelse(test_data$total_rev_hi_lim <= 20000, 'total_rev_hi_lim:10K-20K',
ifelse(test_data$total_rev_hi_lim <= 30000, 'total_rev_hi_lim:20K-30K',
ifelse(test_data$total_rev_hi_lim <= 40000, 'total_rev_hi_lim:30K-40K',
ifelse(test_data$total_rev_hi_lim <= 50000, 'total_rev_hi_lim:40K-50K',
ifelse(test_data$total_rev_hi_lim <= 100000, 'total_rev_hi_lim:50K-100K',
ifelse(test_data$total_rev_hi_lim > 100000, 'total_rev_hi_lim>100K',
NaN))))))))
test_data$installment_factor_trnsfrmd <- ifelse(test_data$installment <= 30, 'installment<=30',
ifelse(test_data$installment <= 100, 'installment:30-100',
ifelse(test_data$installment <= 200, 'installment:100-200',
ifelse(test_data$installment <= 300, 'installment:200-300',
ifelse(test_data$installment <= 400, 'installment:300-400',
ifelse(test_data$installment <= 500, 'installment:400-500',
ifelse(test_data$installment <= 600, 'installment:500-600',
ifelse(test_data$installment <= 700, 'installment:600-700',
ifelse(test_data$installment <= 800, 'installment:700-800',
ifelse(test_data$installment > 800, 'installment>800',
NaN))))))))))
test_data$annual_inc_factor_trnsfrmd <- ifelse(test_data$annual_inc <= 20000, 'annual_inc<=20K',
ifelse(test_data$annual_inc <= 30000, 'annual_inc:20K-30K',
ifelse(test_data$annual_inc <= 40000, 'annual_inc:30K-40K',
ifelse(test_data$annual_inc <= 50000, 'annual_inc:40K-50K',
ifelse(test_data$annual_inc <= 60000, 'annual_inc:50K-60K',
ifelse(test_data$annual_inc <= 70000, 'annual_inc:60K-70K',
ifelse(test_data$annual_inc <= 80000, 'annual_inc:70K-80K',
ifelse(test_data$annual_inc <= 90000, 'annual_inc:80K-90K',
ifelse(test_data$annual_inc <= 100000, 'annual_inc:90K-100K',
ifelse(test_data$annual_inc <= 120000, 'annual_inc:100K-120K',
ifelse(test_data$annual_inc <= 140000, 'annual_inc:120K-140K',
ifelse(test_data$annual_inc > 140000, 'annual_inc:>140K',
NaN))))))))))))
test_data$mths_since_last_delinq_factor_trnsfrmd <- ifelse(test_data$mths_since_last_delinq <= 3, 'mths_since_last_delinq<=3',
ifelse(test_data$mths_since_last_delinq <= 7, 'mths_since_last_delinq:4-7',
ifelse(test_data$mths_since_last_delinq <= 40, 'mths_since_last_delinq:7-40',
ifelse(test_data$mths_since_last_delinq <= 80, 'mths_since_last_delinq:40-80',
ifelse(test_data$mths_since_last_delinq > 80, 'mths_since_last_delinq>80',
ifelse(is.na(test_data$mths_since_last_delinq), 'Missing',
NaN))))))
test_data$dti_factor_trnsfrmd <- ifelse(test_data$dti <= 1, 'dti:<1',
ifelse(test_data$dti <= 2, 'dti:1-2',
ifelse(test_data$dti <= 4, 'dti:2-4',
ifelse(test_data$dti <= 6, 'dti:4-6',
ifelse(test_data$dti <= 8, 'dti:6-8',
ifelse(test_data$dti <= 10, 'dti:8-10',
ifelse(test_data$dti <= 15, 'dti:10-15',
ifelse(test_data$dti <= 20, 'dti:15-20',
ifelse(test_data$dti <= 25, 'dti:20-25',
ifelse(test_data$dti <= 30, 'dti:25-30',
ifelse(test_data$dti <= 35, 'dti:30-35',
ifelse(test_data$dti >35, 'dti:>35',
NaN))))))))))))
test_data$mths_since_last_record_factor_trnsfrmd <- ifelse(test_data$mths_since_last_record <= 2, 'mths_since_last_record<=2',
ifelse(test_data$mths_since_last_record <= 30, 'mths_since_last_record:3-30',
ifelse(test_data$mths_since_last_record <= 65, 'mths_since_last_record:31-65',
ifelse(test_data$mths_since_last_record <= 80, 'mths_since_last_record:66-80',
ifelse(test_data$mths_since_last_record > 80, 'mths_since_last_record>80',
ifelse(is.na(test_data$mths_since_last_record), 'Missing',
NaN))))))
test_data <- test_data %>%
mutate(mths_since_last_record_factor_trnsfrmd = replace_na(mths_since_last_record_factor_trnsfrmd, 'Missing'))
test_data <- test_data %>%
mutate(mths_since_last_delinq_factor_trnsfrmd = replace_na(mths_since_last_delinq_factor_trnsfrmd, 'Missing'))
test_data_trnsfrmd <- test_data %>%
select(ends_with(c('good_bad', 'trnsfrmd')))
We have finally prepared our train and test data for model fitting. Before we proceed with fitting, we inspect the IV table and check for variables with extremely low IV; these can be removed, as they are redundant and have little or no predictive capability. Following is our table of IV values:
library(knitr)
kable(IV_df, caption='Information value table')
WoE_col | IV_value |
---|---|
grade | 0.3221226 |
home_ownership | 0.02284384 |
addr_state | 0.01789869 |
verification_status | 0.03591611 |
purpose | 0.05026488 |
initial_list_status | 0.1531024 |
term_int | 0.02082322 |
emp_length_int | 0.009471146 |
mths_since_issue_d | 0.9180439 |
int_rate_factor | 0.5569444 |
funded_amnt_factor | 0.01916893 |
mths_since_earliest_cr_line | 0.01513249 |
delinq_2yrs | 0.001068474 |
inq_last_6mths | 0.09256693 |
open_acc | 0.008568512 |
pub_rec | 0.005286657 |
total_acc_factor | 0.01002985 |
acc_now_delinq | 0.0001141764 |
total_rev_hi_lim_factor | 0.08260536 |
installment_factor | 0.02821904 |
annual_inc_factor | 0.0523239 |
mths_since_last_delinq_factor | 0.004776739 |
dti_factor | 0.0131144 |
mths_since_last_record_factor | 0.02531216 |
Generally, IV values smaller than 0.02 are considered to indicate no predictive power. However, we will set the bar lower and use an IV cutoff of 0.01: any feature with IV less than 0.01 will be removed from the prepared train and test data and hence not used in the logistic regression. At the same time, let's convert all columns from strings to factors, the de facto datatype for representing categories in R.
library(stats)
library(broom)
library(tidyverse)
library(dplyr)
library(readxl)
library(stringr)
library(purrr)
library(lubridate)
library(caret)
library(ggplot2)
library(data.table)
train_data_trnsfrmd <- train_data_trnsfrmd %>%
select(-contains(pull(IV_df %>% filter(IV_value < 0.01) %>% select(WoE_col))))
test_data_trnsfrmd <- test_data_trnsfrmd %>%
select(-contains(pull(IV_df %>% filter(IV_value < 0.01) %>% select(WoE_col))))
train_data_trnsfrmd <- train_data_trnsfrmd %>% mutate_if(is.character,as.factor)
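For symmetry we convert the test columns as well (our addition; predict() would otherwise coerce the character columns to factors on the fly):
test_data_trnsfrmd <- test_data_trnsfrmd %>% mutate_if(is.character, as.factor)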
We are now all set to fit the logistic regression to our train data and extract parameters for the retained features:
model <- glm(good_bad ~.,family=binomial(link='logit'),data=train_data_trnsfrmd)
After cleaning and processing the data, the following are the performance metrics of the logistic regression fit.
The misclassification error is 0.0886.
The AUROC is about 0.743 (it can be recovered from the Gini index below via AUROC = (Gini + 1)/2), and the ROC curve is plotted with:
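The vector predicted used below is not created anywhere in this section; a minimal sketch, assuming plotROC comes from the InformationValue package:
library(InformationValue)
# out-of-sample probabilities of a loan being good, from the fitted GLM
predicted <- predict(model, newdata = test_data_trnsfrmd, type = "response")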
plotROC(test_data_trnsfrmd$good_bad, predicted)
The Gini index is 0.4863805 and the Kolmogorov-Smirnov statistic is 0.3628.
Before we can calculate the credit score card, we have to prepare the model coefficients table as follows:
test <- dummy.coef(model)  # per-category coefficients; reference categories get 0
score_card_df <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(score_card_df) <- c('col+category', 'col', 'category', 'regression_coeff')
# flatten the named list of coefficients into one row per (feature, category) pair
for (i in seq_along(test)){
  for (j in seq_along(test[[i]])){
    col_category <- paste0(names(test[i]), names(test[[i]][j]))
    correspndng_col <- names(test[i])
    correspndng_category <- names(test[[i]][j])
    reg_coeff <- test[[i]][[j]]
    score_card_df[nrow(score_card_df) + 1,] = list(col_category, correspndng_col, correspndng_category, reg_coeff)
  }
}
p_values_df <- stack(coef(summary(model))[,4])
coeff_df <- stack(coefficients(model))
col_names <- colnames(train_data_trnsfrmd)
We are now ready to calculate the score card.
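For clarity, the scaling below maps each coefficient linearly to score points. Writing $\beta_{\max}$ ($\beta_{\min}$) for the sum over features of the largest (smallest) per-category coefficient, a category's points and the intercept's base score are

$$\text{score}_i = \beta_i\,\frac{850 - 300}{\beta_{\max} - \beta_{\min}}, \qquad \text{score}_0 = \frac{\beta_0 - \beta_{\min}}{\beta_{\max} - \beta_{\min}}\,(850 - 300) + 300,$$

so that the worst possible combination of categories totals roughly 300 and the best roughly 850.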
max_sum_coef <- sum(pull(score_card_df %>%
group_by(col) %>%
summarise(regression_coeff = max(regression_coeff)) %>%
select(regression_coeff)))
min_sum_coef <- sum(pull(score_card_df %>%
group_by(col) %>%
summarise(regression_coeff = min(regression_coeff)) %>%
select(regression_coeff)))
max_score <- 850
min_score <- 300
score_card_df <- score_card_df %>%
mutate(score_calc = regression_coeff * (max_score - min_score) / (max_sum_coef - min_sum_coef)) %>%
mutate(score_calc = ifelse(category == '(Intercept)',
(regression_coeff - min_sum_coef)/(max_sum_coef - min_sum_coef) *
(max_score - min_score) + min_score, score_calc)) %>%
mutate(score_preliminary = round(score_calc))
max_sum_score_prelimnry <- sum(pull(score_card_df %>%
group_by(col) %>%
summarise(score_preliminary = max(score_preliminary)) %>%
select(score_preliminary)))
min_sum_score_prelimnry <- sum(pull(score_card_df %>%
group_by(col) %>%
summarise(score_preliminary = min(score_preliminary)) %>%
select(score_preliminary)))
We also calculate a credit score for each customer, based on the score card. We reuse the fitted glm object: after substituting the preliminary score points for the regression coefficients, predict() (which by default returns the linear predictor) simply sums the score points of each customer's categories, yielding the credit score.
model1 <- model
score_coeff <- score_card_df %>%
filter(regression_coeff != 0) %>%
select(col, score_preliminary)
cat_names <- pull(score_coeff, col)
score_coeff <- pull(score_coeff, score_preliminary)
names(score_coeff) <- cat_names
model1$coefficients <- score_coeff
train_data_trnsfrmd <- train_data_trnsfrmd %>%
mutate(credit_score = predict(model1, train_data_trnsfrmd))
test_data_trnsfrmd <- test_data_trnsfrmd %>%
mutate(credit_score = predict(model1, test_data_trnsfrmd))
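A quick sanity check (our addition; exact quantiles depend on the random split, and because every reference category scores 0 while rounding adds small errors, individual scores can fall slightly outside [300, 850]):
summary(test_data_trnsfrmd$credit_score)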
score_card_final <- score_card_df %>%
select(col, category, regression_coeff, score_preliminary)
colnames(score_card_final) <- c('Feature', 'Category', 'Regression Coefficient', 'Credit Score')
The following table is the score card:
kable(score_card_final, caption='Score Card')
Feature | Category | Regression Coefficient | Credit Score |
---|---|---|---|
(Intercept) | (Intercept) | 4.8751718 | 620 |
grade_trnsfrmd | A | 0.0000000 | 0 |
grade_trnsfrmd | B | 0.3562979 | 14 |
grade_trnsfrmd | C | 0.5832382 | 23 |
grade_trnsfrmd | D | 0.6986421 | 28 |
grade_trnsfrmd | E | 0.9067396 | 36 |
grade_trnsfrmd | F | 0.9546764 | 38 |
grade_trnsfrmd | G | 0.7543712 | 30 |
home_ownership_trnsfrmd | MORTGAGE | 0.0000000 | 0 |
home_ownership_trnsfrmd | OWN | 0.0069895 | 0 |
home_ownership_trnsfrmd | RENT_OTHER_NONE_ANY | -0.1187902 | -5 |
addr_state_trnsfrmd | NaN | 0.0000000 | 0 |
addr_state_trnsfrmd | state_list_1 | -3.1855832 | -127 |
addr_state_trnsfrmd | state_list_10 | -2.8422071 | -113 |
addr_state_trnsfrmd | state_list_11 | -2.7436093 | -109 |
addr_state_trnsfrmd | state_list_12 | -2.6018673 | -104 |
addr_state_trnsfrmd | state_list_2 | -3.1401084 | -125 |
addr_state_trnsfrmd | state_list_3 | -3.1279773 | -125 |
addr_state_trnsfrmd | state_list_4 | -3.0983589 | -123 |
addr_state_trnsfrmd | state_list_5 | -3.1058598 | -124 |
addr_state_trnsfrmd | state_list_6 | -3.0375841 | -121 |
addr_state_trnsfrmd | state_list_7 | -3.0309108 | -121 |
addr_state_trnsfrmd | state_list_8 | -2.9953680 | -119 |
addr_state_trnsfrmd | state_list_9 | -2.9182484 | -116 |
verification_status_trnsfrmd | Not or Source Verified | 0.0000000 | 0 |
verification_status_trnsfrmd | Verified | -0.0629930 | -3 |
purpose_trnsfrmd | purpose_list_1 | 0.0000000 | 0 |
purpose_trnsfrmd | purpose_list_2 | 0.3056351 | 12 |
purpose_trnsfrmd | purpose_list_3 | 0.3466141 | 14 |
purpose_trnsfrmd | purpose_list_4 | 0.4280740 | 17 |
initial_list_status_trnsfrmd | f | 0.0000000 | 0 |
initial_list_status_trnsfrmd | w | 0.3485581 | 14 |
term_int_trnsfrmd | term_36 | 0.0000000 | 0 |
term_int_trnsfrmd | term_60 | -0.0270510 | -1 |
mths_since_issue_d_trnsfrmd | mths_since_issue_date>84 | 0.0000000 | 0 |
mths_since_issue_d_trnsfrmd | mths_since_issue_date<=55 | 1.0273177 | 41 |
mths_since_issue_d_trnsfrmd | mths_since_issue_date>=56<=63 | 0.3240516 | 13 |
mths_since_issue_d_trnsfrmd | mths_since_issue_date>63<=84 | 0.0377305 | 2 |
int_rate_factor_trnsfrmd | 12.025<int_rate<=15.74 | 0.0000000 | 0 |
int_rate_factor_trnsfrmd | 15.74<int_rate<20.281 | -0.6821086 | -27 |
int_rate_factor_trnsfrmd | 9.548<int_rate<=12.025 | 0.6362081 | 25 |
int_rate_factor_trnsfrmd | int_rate<9.548 | 1.5881446 | 63 |
int_rate_factor_trnsfrmd | int_rate>20.281 | -1.3516344 | -54 |
funded_amnt_factor_trnsfrmd | funded_amnt:<5K | 0.0000000 | 0 |
funded_amnt_factor_trnsfrmd | funded_amnt:10K-20K | -0.1309269 | -5 |
funded_amnt_factor_trnsfrmd | funded_amnt:20K-30K | -0.0760804 | -3 |
funded_amnt_factor_trnsfrmd | funded_amnt:30K-50K | 0.0058769 | 0 |
funded_amnt_factor_trnsfrmd | funded_amnt:5K-10K | -0.2454671 | -10 |
mths_since_earliest_cr_line_factor_trnsfrmd | 140<mths_since_earliest_cr_line<=246 | 0.0000000 | 0 |
mths_since_earliest_cr_line_factor_trnsfrmd | 246<mths_since_earliest_cr_line<=270 | -0.0118017 | 0 |
mths_since_earliest_cr_line_factor_trnsfrmd | 270<mths_since_earliest_cr_line<=293 | 0.0182333 | 1 |
mths_since_earliest_cr_line_factor_trnsfrmd | 293<mths_since_earliest_cr_line<=398 | 0.0118682 | 0 |
mths_since_earliest_cr_line_factor_trnsfrmd | mths_since_earliest_cr_line<=140 | 0.0871624 | 3 |
mths_since_earliest_cr_line_factor_trnsfrmd | mths_since_earliest_cr_line>398 | -0.0373978 | -1 |
inq_last_6mths_trnsfrmd | inq_last_6mths=0 | 0.0000000 | 0 |
inq_last_6mths_trnsfrmd | inq_last_6mths=1 | -0.1528066 | -6 |
inq_last_6mths_trnsfrmd | inq_last_6mths=2 | -0.2981309 | -12 |
inq_last_6mths_trnsfrmd | inq_last_6mths=3 | -0.4337185 | -17 |
inq_last_6mths_trnsfrmd | inq_last_6mths>3 | -0.5711087 | -23 |
total_acc_factor_trnsfrmd | total_acc<=10 | 0.0000000 | 0 |
total_acc_factor_trnsfrmd | total_acc<=25 | -0.1011973 | -4 |
total_acc_factor_trnsfrmd | total_acc<=50 | -0.1370747 | -5 |
total_acc_factor_trnsfrmd | total_acc>50 | -0.1078410 | -4 |
total_rev_hi_lim_factor_trnsfrmd | total_rev_hi_lim:10K-20K | 0.0000000 | 0 |
total_rev_hi_lim_factor_trnsfrmd | total_rev_hi_lim:20K-30K | 0.0502214 | 2 |
total_rev_hi_lim_factor_trnsfrmd | total_rev_hi_lim:30K-40K | 0.0564718 | 2 |
total_rev_hi_lim_factor_trnsfrmd | total_rev_hi_lim:40K-50K | 0.1088412 | 4 |
total_rev_hi_lim_factor_trnsfrmd | total_rev_hi_lim:50K-100K | 0.1303267 | 5 |
total_rev_hi_lim_factor_trnsfrmd | total_rev_hi_lim:5K-10K | -0.0345617 | -1 |
total_rev_hi_lim_factor_trnsfrmd | total_rev_hi_lim<=5K | -0.1350045 | -5 |
total_rev_hi_lim_factor_trnsfrmd | total_rev_hi_lim>100K | 0.3046205 | 12 |
installment_factor_trnsfrmd | installment:100-200 | 0.0000000 | 0 |
installment_factor_trnsfrmd | installment:200-300 | -0.1212032 | -5 |
installment_factor_trnsfrmd | installment:30-100 | 0.1448740 | 6 |
installment_factor_trnsfrmd | installment:300-400 | -0.2405831 | -10 |
installment_factor_trnsfrmd | installment:400-500 | -0.3412615 | -14 |
installment_factor_trnsfrmd | installment:500-600 | -0.4149059 | -17 |
installment_factor_trnsfrmd | installment:600-700 | -0.4346135 | -17 |
installment_factor_trnsfrmd | installment:700-800 | -0.4475585 | -18 |
installment_factor_trnsfrmd | installment<=30 | -0.2536697 | -10 |
installment_factor_trnsfrmd | installment>800 | -0.5732664 | -23 |
annual_inc_factor_trnsfrmd | annual_inc:>140K | 0.0000000 | 0 |
annual_inc_factor_trnsfrmd | annual_inc:100K-120K | -0.0719114 | -3 |
annual_inc_factor_trnsfrmd | annual_inc:120K-140K | 0.0097124 | 0 |
annual_inc_factor_trnsfrmd | annual_inc:20K-30K | -0.7327546 | -29 |
annual_inc_factor_trnsfrmd | annual_inc:30K-40K | -0.6519481 | -26 |
annual_inc_factor_trnsfrmd | annual_inc:40K-50K | -0.5142317 | -20 |
annual_inc_factor_trnsfrmd | annual_inc:50K-60K | -0.4453559 | -18 |
annual_inc_factor_trnsfrmd | annual_inc:60K-70K | -0.3208593 | -13 |
annual_inc_factor_trnsfrmd | annual_inc:70K-80K | -0.2675392 | -11 |
annual_inc_factor_trnsfrmd | annual_inc:80K-90K | -0.1755016 | -7 |
annual_inc_factor_trnsfrmd | annual_inc:90K-100K | -0.1508294 | -6 |
annual_inc_factor_trnsfrmd | annual_inc<=20K | -0.7749635 | -31 |
dti_factor_trnsfrmd | dti:<1 | 0.0000000 | 0 |
dti_factor_trnsfrmd | dti:>35 | 0.4639364 | 18 |
dti_factor_trnsfrmd | dti:1-2 | -0.0313432 | -1 |
dti_factor_trnsfrmd | dti:10-15 | 0.0216819 | 1 |
dti_factor_trnsfrmd | dti:15-20 | -0.0232163 | -1 |
dti_factor_trnsfrmd | dti:2-4 | 0.0662698 | 3 |
dti_factor_trnsfrmd | dti:20-25 | -0.0646567 | -3 |
dti_factor_trnsfrmd | dti:25-30 | -0.1112251 | -4 |
dti_factor_trnsfrmd | dti:30-35 | 0.0325305 | 1 |
dti_factor_trnsfrmd | dti:4-6 | 0.1200488 | 5 |
dti_factor_trnsfrmd | dti:6-8 | 0.1163105 | 5 |
dti_factor_trnsfrmd | dti:8-10 | 0.0667734 | 3 |
mths_since_last_record_factor_trnsfrmd | Missing | 0.0000000 | 0 |
mths_since_last_record_factor_trnsfrmd | mths_since_last_record:3-30 | 0.2355819 | 9 |
mths_since_last_record_factor_trnsfrmd | mths_since_last_record:31-65 | 0.4139568 | 16 |
mths_since_last_record_factor_trnsfrmd | mths_since_last_record:66-80 | 0.3945855 | 16 |
mths_since_last_record_factor_trnsfrmd | mths_since_last_record<=2 | -0.6925020 | -28 |
mths_since_last_record_factor_trnsfrmd | mths_since_last_record>80 | -0.1467817 | -6 |
For monitoring, we start by transforming the new data in exactly the same way as the train and test data.
new_data <- fread("C:\\Asus WebStorage\\nitin.7785@gmail.com\\MySyncFolder\\Credit_Risk_Modelling\\Credit-Risk-Model\\publish_git\\data\\lc_2016_2017.csv")
# General Preprocessing
# remove columns with all NA's
new_data <- new_data %>%
select_if(~sum(!is.na(.)) > 0)
# convert emp_length column to integers, replace "< 1 year" with 0 and replace NA's with 0
new_data <- new_data %>%
mutate(emp_length_int = gsub("< 1 year", "0", emp_length))
new_data$emp_length_int <- map_chr(new_data$emp_length_int, numextract) %>%
as.double() %>%
replace_na(0)
# convert term_int column to integers
new_data <- new_data %>%
mutate(term_int = ifelse(term == '36 months', 36, 60))
distinct(new_data, term_int)
# convert earliest_cr_line and issue_d to dates and calculate the
# months up to 01-12-2017
new_data$mths_since_earliest_cr_line <- interval(parse_date_time(new_data$earliest_cr_line, "by"),
as.POSIXct("2017-12-01"))%/% months(1)
new_data$mths_since_earliest_cr_line <- ifelse(new_data$mths_since_earliest_cr_line < 0,
max(new_data$mths_since_earliest_cr_line, na.rm = TRUE),
new_data$mths_since_earliest_cr_line)
new_data$mths_since_issue_d <- interval(parse_date_time(new_data$issue_d, "by"),
as.POSIXct("2017-12-01"))%/% months(1)
# preprocessing discrete variables
# since R's model-fitting functions handle factors directly, we do not need to create dummy variables; we simply note down the discrete variable names
# grade, sub_grade, home_ownership, verification_status, loan_status, purpose, addr_state, initial_list_status
# check for missing values and clean
data_summary <- summary(new_data)
# total_rev_hi_lim is 'Total revolving high credit / credit limit', so it makes sense to replace missing values with funded_amnt
new_data$total_rev_hi_lim <- ifelse(is.na(new_data$total_rev_hi_lim),
new_data$funded_amnt, new_data$total_rev_hi_lim)
# missing annual income is replaced by the mean annual income
new_data$annual_inc <- ifelse(is.na(new_data$annual_inc),
mean(new_data$annual_inc, na.rm = TRUE), new_data$annual_inc)
# for the rest, fill NAs with 0
new_data$mths_since_earliest_cr_line <- ifelse(is.na(new_data$mths_since_earliest_cr_line),
0, new_data$mths_since_earliest_cr_line)
new_data$acc_now_delinq <- ifelse(is.na(new_data$acc_now_delinq), 0, new_data$acc_now_delinq)
new_data$total_acc <- ifelse(is.na(new_data$total_acc), 0, new_data$total_acc)
new_data$pub_rec <- ifelse(is.na(new_data$pub_rec), 0, new_data$pub_rec)
new_data$open_acc <- ifelse(is.na(new_data$open_acc), 0, new_data$open_acc)
new_data$inq_last_6mths <- ifelse(is.na(new_data$inq_last_6mths), 0, new_data$inq_last_6mths)
new_data$delinq_2yrs <- ifelse(is.na(new_data$delinq_2yrs), 0, new_data$delinq_2yrs)
new_data$emp_length_int <- ifelse(is.na(new_data$emp_length_int), 0, new_data$emp_length_int)
# remove unwanted columns
keep_columns <- c("id", "grade",
"home_ownership", "loan_status", "addr_state",
"verification_status", "purpose",
"initial_list_status", "term_int",
"emp_length_int", "mths_since_issue_d",
"int_rate", "funded_amnt",
"mths_since_earliest_cr_line", "delinq_2yrs",
"inq_last_6mths", "open_acc",
"pub_rec", "total_acc",
"acc_now_delinq", "total_rev_hi_lim",
"installment", "annual_inc",
"mths_since_last_delinq", "dti",
"mths_since_last_record")
new_data <- new_data %>%
  select(all_of(keep_columns))
#### PD model
# Data preparation
# Dependent variable, good/bad (default) definition
# check each loan_status frequency
# create good_bad (default or not) column from loan_status and observe the frequency of good/bad loans
bad_status <- c('Charged Off', 'Default',
'Does not meet the credit policy. Status:Charged Off',
'Late (31-120 days)')
new_data$good_bad <- ifelse(new_data$loan_status %in% bad_status, 0, 1)
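As the comments above suggest, we can quickly inspect the loan_status frequencies and the resulting good/bad balance (a small sketch in the same dplyr style):
new_data %>% count(loan_status, sort = TRUE)
prop.table(table(new_data$good_bad))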
new_data$grade_trnsfrmd <- new_data$grade
new_data$home_ownership_trnsfrmd <- ifelse(new_data$home_ownership %in% combine_home_ownership, "RENT_OTHER_NONE_ANY", new_data$home_ownership)
state1 <- c("NV", "NE", "IA", "ME")
state2 <- c("HI", "FL", "AL", "LA")
state3 <- c("NY")
state4 <- c("NC", "MD", "NM", "VA", "NJ", "UT", "MO")
state5 <- c("CA")
state6 <- c("AZ", "AR", "MI")
state7 <- c("OK", "TN", "PA")
state8 <- c("DE", "MA", "RI", "KY", "MN", "OH", "SD", "IN", "OR", "GA", "WA", "ID", "WI", "MT")
state9 <- c("TX")
state10 <- c("VT", "CT", "IL", "AK", "SC", "CO", "KS", "MS")
state11 <- c("NH", "WV", "WY")
state12 <- c("DC")
new_data$addr_state_trnsfrmd <- ifelse(new_data$addr_state %in% state1, "state_list_1",
ifelse(new_data$addr_state %in% state2, "state_list_2",
ifelse(new_data$addr_state %in% state3, "state_list_3",
ifelse(new_data$addr_state %in% state4, "state_list_4",
ifelse(new_data$addr_state %in% state5, "state_list_5",
ifelse(new_data$addr_state %in% state6, "state_list_6",
ifelse(new_data$addr_state %in% state7, "state_list_7",
ifelse(new_data$addr_state %in% state8, "state_list_8",
ifelse(new_data$addr_state %in% state9, "state_list_9",
ifelse(new_data$addr_state %in% state10, "state_list_10",
ifelse(new_data$addr_state %in% state11, "state_list_11",
ifelse(new_data$addr_state %in% state12, "state_list_12",NaN))))))))))))
new_data$verification_status_trnsfrmd <- ifelse(new_data$verification_status %in% c("Not Verified", "Source Verified"),
"Not or Source Verified",
new_data$verification_status)
purpose1 <- c("small_business", "educational", "renewable_energy", "moving")
purpose2 <- c("other", "medical", "house")
purpose3 <- c("debt_consolidation", "vacation", "home_improvement")
purpose4 <- c("wedding", "car", "major_purchase", "credit_card")
new_data$purpose_trnsfrmd <- ifelse(new_data$purpose %in% purpose1, "purpose_list_1",
ifelse(new_data$purpose %in% purpose2, "purpose_list_2",
ifelse(new_data$purpose %in% purpose3, "purpose_list_3",
ifelse(new_data$purpose %in% purpose4, "purpose_list_4", NaN))))
new_data$initial_list_status_trnsfrmd <- new_data$initial_list_status
new_data$term_int_trnsfrmd <- ifelse(new_data$term_int == 36, 'term_36',
ifelse(new_data$term_int == 60, 'term_60', NaN))
new_data$emp_length_int_trnsfrmd <- ifelse(new_data$emp_length_int %in% c(0), 'emp_length_0',
ifelse(new_data$emp_length_int %in% c(1,2,3,4), 'emp_length_1_4',
ifelse(new_data$emp_length_int %in% c(5,6), 'emp_length_5_6',
ifelse(new_data$emp_length_int %in% c(7,8,9), 'emp_length_7_9',
'emp_length_10'))))
new_data$mths_since_issue_d_trnsfrmd <- ifelse(new_data$mths_since_issue_d <= 55, paste0('mths_since_issue_date=', as.character(new_data$mths_since_issue_d)),
ifelse(new_data$mths_since_issue_d %in% c(56,57,58,59,60,61,62,63), 'mths_since_issue_date>=56<=63',
ifelse(new_data$mths_since_issue_d %in% seq(64,84), 'mths_since_issue_date>63<=84',
ifelse(new_data$mths_since_issue_d > 84, 'mths_since_issue_date>84', NaN))))
new_data$int_rate_factor_trnsfrmd <- ifelse(new_data$int_rate <= 9.548, 'int_rate<9.548',
ifelse(new_data$int_rate <= 12.025, '9.548<int_rate<=12.025',
ifelse(new_data$int_rate <= 15.74, '12.025<int_rate<=15.74',
ifelse(new_data$int_rate <= 20.281, '15.74<int_rate<20.281',
ifelse(new_data$int_rate >20.281, 'int_rate>20.281',NaN)))))
new_data$funded_amnt_factor_trnsfrmd <- ifelse(new_data$funded_amnt <= 5000, 'funded_amnt:<5K',
ifelse(new_data$funded_amnt <= 10000, 'funded_amnt:5K-10K',
ifelse(new_data$funded_amnt <= 20000, 'funded_amnt:10K:20K',
                                        ifelse(new_data$funded_amnt <= 30000, 'funded_amnt:20K:30K',
ifelse(new_data$funded_amnt <= 50000, 'funded_amnt:30K:50K',
ifelse(new_data$funded_amnt <= 70000, 'funded_amnt:50K:70K',
ifelse(new_data$funded_amnt <= 100000, 'funded_amnt:70K:100K',
ifelse(new_data$funded_amnt > 100000, 'funded_amnt:>100K',
NaN))))))))
new_data$mths_since_earliest_cr_line_factor_trnsfrmd <- ifelse(new_data$mths_since_earliest_cr_line <= 140, 'mths_since_earliest_cr_line<=140',
ifelse(new_data$mths_since_earliest_cr_line <= 246, '140<mths_since_earliest_cr_line<=246',
ifelse(new_data$mths_since_earliest_cr_line <= 270, '246<mths_since_earliest_cr_line<=270',
ifelse(new_data$mths_since_earliest_cr_line <= 293, '270<mths_since_earliest_cr_line<=293',
ifelse(new_data$mths_since_earliest_cr_line <= 398, '293<mths_since_earliest_cr_line<=398',
ifelse(new_data$mths_since_earliest_cr_line >398, 'mths_since_earliest_cr_line>398',
NaN))))))
new_data$delinq_2yrs_trnsfrmd <- ifelse(new_data$delinq_2yrs == 0, 'delinq_2yrs=0',
ifelse(new_data$delinq_2yrs == 1, 'delinq_2yrs=1',
ifelse(new_data$delinq_2yrs == 2, 'delinq_2yrs=2',
ifelse(new_data$delinq_2yrs > 2, 'delinq_2yrs>2',
NaN))))
new_data$inq_last_6mths_trnsfrmd <- ifelse(new_data$inq_last_6mths == 0, 'inq_last_6mths=0',
ifelse(new_data$inq_last_6mths == 1, 'inq_last_6mths=1',
ifelse(new_data$inq_last_6mths == 2, 'inq_last_6mths=2',
ifelse(new_data$inq_last_6mths == 3, 'inq_last_6mths=3',
ifelse(new_data$inq_last_6mths > 3, 'inq_last_6mths>3',
NaN)))))
new_data$open_acc_trnsfrmd <- ifelse(new_data$open_acc %in% c(0,1,2,3,seq(21,max(new_data$open_acc))), 'open_acc=0,1,2,3,21-max',
ifelse(new_data$open_acc %in% c(4,5,6,7), 'open_acc=4,5,6,7',
ifelse(new_data$open_acc %in% c(8), 'open_acc=8',
ifelse(new_data$open_acc %in% c(9,10,11,12), 'open_acc=9,10,11,12',
ifelse(new_data$open_acc %in% c(13,14,15,16), 'open_acc=13,14,15,16',
ifelse(new_data$open_acc %in% c(17), 'open_acc=17',
ifelse(new_data$open_acc %in% c(18,19,20), 'open_acc=18,19,20',
NaN)))))))
new_data$pub_rec_trnsfrmd <- ifelse(new_data$pub_rec == 0, 'pub_rec=0',
ifelse(new_data$pub_rec == 1, 'pub_rec=1',
ifelse(new_data$pub_rec == 2, 'pub_rec=2',
ifelse(new_data$pub_rec >2, 'pub_rec>=3',
NaN))))
new_data$total_acc_factor_trnsfrmd <- ifelse(new_data$total_acc <= 10, 'total_acc<=10',
ifelse(new_data$total_acc <= 25, 'total_acc<=25',
ifelse(new_data$total_acc <= 50, 'total_acc<=50',
ifelse(new_data$total_acc > 50, 'total_acc>50',
NaN))))
new_data$acc_now_delinq_trnsfrmd <- ifelse(new_data$acc_now_delinq == 0, 'acc_now_delinq=0',
ifelse(new_data$acc_now_delinq >= 1, 'acc_now_delinq>=1',
NaN))
new_data$total_rev_hi_lim_factor_trnsfrmd <- ifelse(new_data$total_rev_hi_lim <= 5000, 'total_rev_hi_lim<=5K',
ifelse(new_data$total_rev_hi_lim <= 10000, 'total_rev_hi_lim:5K-10K',
ifelse(new_data$total_rev_hi_lim <= 20000, 'total_rev_hi_lim:10K-20K',
ifelse(new_data$total_rev_hi_lim <= 30000, 'total_rev_hi_lim:20K-30K',
ifelse(new_data$total_rev_hi_lim <= 40000, 'total_rev_hi_lim:30K-40K',
ifelse(new_data$total_rev_hi_lim <= 50000, 'total_rev_hi_lim:40K-50K',
                                        ifelse(new_data$total_rev_hi_lim <= 100000, 'total_rev_hi_lim:50K-100K',
ifelse(new_data$total_rev_hi_lim > 100000, 'total_rev_hi_lim>100K',
NaN))))))))
new_data$installment_factor_trnsfrmd <- ifelse(new_data$installment <= 30, 'installment<=30',
ifelse(new_data$installment <= 100, 'installment:30-100',
ifelse(new_data$installment <= 200, 'installment:100-200',
ifelse(new_data$installment <= 300, 'installment:200-300',
ifelse(new_data$installment <= 400, 'installment:300-400',
ifelse(new_data$installment <= 500, 'installment:400-500',
                                        ifelse(new_data$installment <= 600, 'installment:500-600',
ifelse(new_data$installment <= 700, 'installment:600-700',
ifelse(new_data$installment <= 800, 'installment:700-800',
ifelse(new_data$installment > 800, 'installment>800',
NaN))))))))))
new_data$annual_inc_factor_trnsfrmd <- ifelse(new_data$annual_inc <= 20000, 'annual_inc<=20K',
ifelse(new_data$annual_inc <= 30000, 'annual_inc:20K-30K',
ifelse(new_data$annual_inc <= 40000, 'annual_inc:30K-40K',
ifelse(new_data$annual_inc <= 50000, 'annual_inc:40K-50K',
ifelse(new_data$annual_inc <= 60000, 'annual_inc:50K-60K',
ifelse(new_data$annual_inc <= 70000, 'annual_inc:60K-70K',
ifelse(new_data$annual_inc <= 80000, 'annual_inc:70K-80K',
ifelse(new_data$annual_inc <= 90000, 'annual_inc:80K-90K',
ifelse(new_data$annual_inc <= 100000, 'annual_inc:90K-100K',
ifelse(new_data$annual_inc <= 120000, 'annual_inc:100K-120K',
ifelse(new_data$annual_inc <= 140000, 'annual_inc:120K-140K',
ifelse(new_data$annual_inc > 140000, 'annual_inc:>140K',
NaN))))))))))))
new_data$mths_since_last_delinq_factor_trnsfrmd <- ifelse(new_data$mths_since_last_delinq <= 3, 'mths_since_last_delinq<=3',
ifelse(new_data$mths_since_last_delinq <= 7, 'mths_since_last_delinq:4-7',
ifelse(new_data$mths_since_last_delinq <= 40, 'mths_since_last_delinq:7-40',
ifelse(new_data$mths_since_last_delinq <= 80, 'mths_since_last_delinq:40-80',
ifelse(new_data$mths_since_last_delinq > 80, 'mths_since_last_delinq>80',
ifelse(is.na(new_data$mths_since_last_delinq), 'Missing',
NaN))))))
new_data$dti_factor_trnsfrmd <- ifelse(new_data$dti <= 1, 'dti:<1',
ifelse(new_data$dti <= 2, 'dti:1-2',
ifelse(new_data$dti <= 4, 'dti:2-4',
ifelse(new_data$dti <= 6, 'dti:4-6',
ifelse(new_data$dti <= 8, 'dti:6-8',
ifelse(new_data$dti <= 10, 'dti:8-10',
ifelse(new_data$dti <= 15, 'dti:10-15',
ifelse(new_data$dti <= 20, 'dti:15-20',
ifelse(new_data$dti <= 25, 'dti:20-25',
ifelse(new_data$dti <= 30, 'dti:25-30',
ifelse(new_data$dti <= 35, 'dti:30-35',
ifelse(new_data$dti >35, 'dti:>35',
NaN))))))))))))
new_data$mths_since_last_record_factor_trnsfrmd <- ifelse(new_data$mths_since_last_record <= 2, 'mths_since_last_record<=2',
ifelse(new_data$mths_since_last_record <= 30, 'mths_since_last_record:3-30',
ifelse(new_data$mths_since_last_record <= 65, 'mths_since_last_record:31-65',
ifelse(new_data$mths_since_last_record <= 80, 'mths_since_last_record:66-80',
ifelse(new_data$mths_since_last_record > 80, 'mths_since_last_record>80',
ifelse(is.na(new_data$mths_since_last_record), 'Missing',
NaN))))))
new_data <- new_data %>%
mutate(mths_since_last_record_factor_trnsfrmd = replace_na(mths_since_last_record_factor_trnsfrmd, 'Missing'))
new_data <- new_data %>%
mutate(mths_since_last_delinq_factor_trnsfrmd = replace_na(mths_since_last_delinq_factor_trnsfrmd, 'Missing'))
new_data_trnsfrmd <- new_data %>%
select(ends_with(c('good_bad', 'trnsfrmd')))
Looking at the AUROC, we saw that our model predicts the probability of default, and hence the credit score, to an acceptable level. However, as new data arrives, our current model may grow out of date: it has not been trained on the new data, which might differ substantially from the old. To assess whether the model needs updating, we must quantify how different the new data is from the data the model was trained on. We do this with the Population Stability Index (PSI), which tells us how much the distribution of each feature in the new data differs from the old data. To calculate the PSI, we first compute the proportion of each category of each feature in both the new and the old data,
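In formula form, for a single feature with categories i and category proportions p,
PSI = Σᵢ (pᵢ,new − pᵢ,train) · ln(pᵢ,new / pᵢ,train),
where categories with zero proportion on either side contribute nothing, exactly as the code below handles them.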
# predict credit scores on the new data, drop low-IV features, then categorize the scores
new_data_trnsfrmd <- new_data_trnsfrmd %>%
mutate(credit_score = predict(model1, new_data_trnsfrmd))
new_data_trnsfrmd <- new_data_trnsfrmd %>%
select(-contains(pull(IV_df %>% filter(IV_value < 0.01) %>% select(WoE_col))))
train_data_trnsfrmd$score_cat <- cut(train_data_trnsfrmd$credit_score, seq(300,850,50))
new_data_trnsfrmd$score_cat <- cut(new_data_trnsfrmd$credit_score, seq(300,850,50))
# calculate proportions of each category in each feature in train data
num_features <- length(colnames(train_data_trnsfrmd)) - 2
col_names <- colnames(train_data_trnsfrmd)
train_categories_freq <- data.frame()
for (col_index in c(2:(num_features), num_features + 2)){
test <- train_data_trnsfrmd[, ..col_index] %>%
group_by_all() %>%
summarise(freq_train = n()) %>%
mutate(column_name = col_names[col_index]) %>%
rename(categories = 1)
train_categories_freq <- train_categories_freq %>%
bind_rows(test)
}
# calculate proportions of each category in each feature in new data
num_features <- length(colnames(new_data_trnsfrmd)) - 2
col_names <- colnames(new_data_trnsfrmd)
new_categories_freq <- data.frame()
for (col_index in c(2:(num_features), num_features + 2)){
test <- new_data_trnsfrmd[, ..col_index] %>%
group_by_all() %>%
summarise(freq_new = n()) %>%
mutate(column_name = col_names[col_index]) %>%
rename(categories = 1)
new_categories_freq <- new_categories_freq %>%
bind_rows(test)
}
PSI_calc <- train_categories_freq %>%
  left_join(new_categories_freq, by = c('categories', 'column_name')) %>%
mutate(freq_train = freq_train / nrow(train_data_trnsfrmd),
freq_new = freq_new / nrow(new_data_trnsfrmd)) %>%
ungroup() %>%
mutate(freq_train = ifelse(is.na(freq_train), 0, freq_train),
freq_new = ifelse(is.na(freq_new), 0, freq_new))
Having joined the old and new category frequencies, we can now calculate the PSI of each feature,
freq_table <- PSI_calc %>%
  mutate(contribution = ifelse(freq_new == 0 | freq_train == 0, 0,
                               (freq_new - freq_train) * log(freq_new / freq_train)))
PSI_calc <- freq_table %>%
  group_by(column_name) %>%
  summarise(contribution = sum(contribution)) %>%
  mutate(column_name = str_replace(column_name, "_trnsfrmd", ""))
colnames(PSI_calc) <- c('Feature', 'PSI Contribution')
kable(PSI_calc, caption='Population Stability Index Table')
Feature | PSI Contribution |
---|---|
addr_state | 0.0084278 |
annual_inc_factor | 0.0102393 |
dti_factor | 0.0049768 |
funded_amnt_factor | 0.0158914 |
grade | 0.0256307 |
home_ownership | 0.0042154 |
initial_list_status | 0.3472448 |
inq_last_6mths | 0.0286351 |
installment_factor | 0.0209305 |
int_rate_factor | 0.0557600 |
mths_since_earliest_cr_line_factor | 0.0859222 |
mths_since_issue_d | 0.0192241 |
mths_since_last_record_factor | 0.0123267 |
purpose | 0.0089714 |
score_cat | 0.1488427 |
term_int | 0.0076333 |
total_acc_factor | 0.0105239 |
total_rev_hi_lim_factor | 0.0244212 |
verification_status | 0.0115202 |
The rule of thumb for interpreting PSI is:
PSI value | Conclusion |
---|---|
PSI < 0.1 | No change. You can continue using the existing model. |
0.1 <= PSI < 0.2 | Slight change is required. |
PSI >= 0.2 | Significant change is required. Ideally, you should not use this model any more. |
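We can apply these thresholds programmatically to the PSI table above (a small sketch; recall the column was renamed to 'PSI Contribution'):
PSI_calc %>%
  mutate(Conclusion = cut(`PSI Contribution`,
                          breaks = c(-Inf, 0.1, 0.2, Inf), right = FALSE,
                          labels = c('no change', 'slight change', 'significant change')))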
Based on this table, we conclude that initial_list_status_trnsfrmd, with a PSI of 0.3472448, has changed substantially. Hence, let us examine it closely to see what exactly has changed.
kable(freq_table %>% filter(column_name == 'initial_list_status_trnsfrmd'), caption='Population Stability Index Table')
categories | freq_train | column_name | freq_new | contribution |
---|---|---|---|---|
f | 0.5150067 | initial_list_status_trnsfrmd | 0.2350205 | 0.2196512 |
w | 0.4849933 | initial_list_status_trnsfrmd | 0.7649795 | 0.1275936 |
Notice that the proportions of w and f are quite different between the old and new data, but this represents a change in the bank's listing strategy rather than a change in the borrower population. Hence, it is not a cause to change the model.
The second variable with a relatively high PSI contribution is score_cat, the categorised credit score. Its PSI of 0.1488427 means a slight change to the model is needed before applying it to the new data.
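As with initial_list_status, we can inspect which score bands have shifted by filtering the joined frequency table (same pattern as above):
kable(freq_table %>% filter(column_name == 'score_cat'), caption='Score category frequencies, old vs new')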
To calculate the expected loss (EL) of a loan portfolio we need three quantities: 1) Probability of Default (PD), 2) Loss Given Default (LGD) and 3) Exposure at Default (EAD); the expected loss is then EL = PD × LGD × EAD. We have already calculated PD, so we now turn to the other two, LGD and EAD.
To calculate LGD and EAD we first filter on loan_status: 'Charged Off' and 'Does not meet the credit policy. Status:Charged Off', as these are the loans that have had enough time for recoveries to be completed. We then calculate the recovery rate and the Credit Conversion Factor (CCF) for these loans, split the data into train and test sets, and fit the models. Since the recovery rate, and hence LGD = 1 − recovery rate, can take any value between 0 and 1, LGD is generally modeled using beta regression instead of the logistic regression used for PD.
library(betareg)
library(MLmetrics)
# concatenate train and test data back into the full dataset
full_data <- train_data %>%
bind_rows(test_data)
# calculate recovery rate and ccf
default_data <- full_data %>%
filter(loan_status %in% c('Charged Off','Does not meet the credit policy. Status:Charged Off')) %>%
mutate(mths_since_last_delinq = replace_na(mths_since_last_delinq, 0),
mths_since_last_record = replace_na(mths_since_last_record, 0),
recovery_rate = recoveries / funded_amnt) %>%
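  # betareg requires the response strictly inside (0, 1), so recovery rates at or beyond the boundaries are clamped to 0.001 / 0.999 below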
mutate(recovery_rate = ifelse(recovery_rate <= 0, 0.001,
ifelse(recovery_rate >= 1, 0.999,
recovery_rate))) %>%
mutate(ccf = (funded_amnt - total_rec_prncp) / funded_amnt)
# divide full default data into training and testing datasets.
train_index <- createDataPartition(default_data$recovery_rate,p=0.8,list=FALSE)
train_default <- default_data[train_index,]
test_default <- default_data[-train_index,]
# list of relevant fields and keep only these columns
default_cols <- c('grade',
'home_ownership',
'verification_status',
'purpose',
'initial_list_status',
'term_int',
'emp_length_int',
'mths_since_issue_d',
'mths_since_earliest_cr_line',
'funded_amnt',
'int_rate',
'installment',
'annual_inc',
'dti',
'delinq_2yrs',
'inq_last_6mths',
'mths_since_last_delinq',
'mths_since_last_record',
'open_acc',
'pub_rec',
'total_acc',
'acc_now_delinq',
'total_rev_hi_lim',
'recovery_rate')
# convert character columns into factors.
lgd_input_train <- train_default %>%
select(all_of(default_cols)) %>%
mutate_if(is.character,as.factor)
lgd_input_test <- test_default %>%
select(all_of(default_cols)) %>%
mutate_if(is.character,as.factor)
# Beta regression
lgd_model <- betareg(recovery_rate ~., data=lgd_input_train)
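The fitted object can be inspected like any other R model; the summary reports the coefficient estimates along with the precision parameter (phi) of the beta distribution (output omitted here for brevity):
summary(lgd_model)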
After fitting the beta regression, we predict the recovery rate on both the training and testing data using the fitted coefficients. The reason we also predict on the training set is to check whether our model is over- or underfitting: we calculate the MSE and RMSE of the predictions against the actual values for both sets, and if these quantities are close for training and testing, the model does not suffer from over/underfitting.
predict_test <- predict(lgd_model, lgd_input_test, type = 'response')
predict_train <- predict(lgd_model, lgd_input_train, type = 'response')
print(paste0("MSE for testing dataset is: ", MSE(predict_test, lgd_input_test$recovery_rate)))
## [1] "MSE for testing dataset is: 0.00782598568596409"
print(paste0("RMSE for testing dataset is: ", RMSE(predict_test, lgd_input_test$recovery_rate)))
## [1] "RMSE for testing dataset is: 0.0884646013158037"
print(paste0("MSE for training dataset is: ", MSE(predict_train, lgd_input_train$recovery_rate)))
## [1] "MSE for training dataset is: 0.00773418256727992"
print(paste0("RMSE for training dataset is: ", RMSE(predict_train, lgd_input_train$recovery_rate)))
## [1] "RMSE for training dataset is: 0.0879442014420503"
As the MSE and RMSE for the testing and training sets are quite close to each other, the model shows no sign of over- or underfitting.
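As an additional visual check, we can plot predicted against actual recovery rates on the test set (a sketch using the already-loaded ggplot2):
ggplot(data.frame(actual = lgd_input_test$recovery_rate,
                  predicted = predict_test),
       aes(x = actual, y = predicted)) +
  geom_point(alpha = 0.1) +                              # one point per defaulted loan
  geom_abline(slope = 1, intercept = 0, colour = 'red')  # perfect-prediction line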
The majority of steps for modelling EAD are the same as for LGD, with two key differences: 1) the target variable is ccf instead of recovery_rate; 2) to model EAD we use a generalised linear model (a plain Gaussian glm, i.e., ordinary least squares) instead of the beta regression used for LGD. The EAD itself is then recovered as EAD = CCF × funded_amnt.
default_cols <- c('grade',
'home_ownership',
'verification_status',
'purpose',
'initial_list_status',
'term_int',
'emp_length_int',
'mths_since_issue_d',
'mths_since_earliest_cr_line',
'funded_amnt',
'int_rate',
'installment',
'annual_inc',
'dti',
'delinq_2yrs',
'inq_last_6mths',
'mths_since_last_delinq',
'mths_since_last_record',
'open_acc',
'pub_rec',
'total_acc',
'acc_now_delinq',
'total_rev_hi_lim',
'ccf')
ead_input_train <- train_default %>%
select(all_of(default_cols)) %>%
mutate_if(is.character,as.factor)
ead_input_test <- test_default %>%
select(all_of(default_cols)) %>%
mutate_if(is.character,as.factor)
sapply(ead_input_train, function(y) sum(length(which(is.na(y)))))
## grade home_ownership
## 0 0
## verification_status purpose
## 0 0
## initial_list_status term_int
## 0 0
## emp_length_int mths_since_issue_d
## 0 0
## mths_since_earliest_cr_line funded_amnt
## 0 0
## int_rate installment
## 0 0
## annual_inc dti
## 0 0
## delinq_2yrs inq_last_6mths
## 0 0
## mths_since_last_delinq mths_since_last_record
## 0 0
## open_acc pub_rec
## 0 0
## total_acc acc_now_delinq
## 0 0
## total_rev_hi_lim ccf
## 0 0
# fit a Gaussian GLM (ordinary least squares) to the credit conversion factor
ead_model <- glm(ccf ~., data=ead_input_train)
Again, we check the model for over- or underfitting.
predict_test <- predict(ead_model, ead_input_test, type = 'response')
predict_train <- predict(ead_model, ead_input_train, type = 'response')
print(paste0("MSE for testing dataset is: ", MSE(predict_test, ead_input_test$ccf)))
## [1] "MSE for testing dataset is: 0.0274414936721869"
print(paste0("RMSE for testing dataset is: ", RMSE(predict_test, ead_input_test$ccf)))
## [1] "RMSE for testing dataset is: 0.165654742377593"
print(paste0("MSE for training dataset is: ", MSE(predict_train, ead_input_train$ccf)))
## [1] "MSE for training dataset is: 0.0283002547548188"
print(paste0("RMSE for training dataset is: ", RMSE(predict_train, ead_input_train$ccf)))
## [1] "RMSE for training dataset is: 0.168226795591008"