Project Description

In this project we model the LendingClub data available at: https://www.kaggle.com/husainsb/lendingclub-issued-loans#lc_loan.csv We start with the loans issued between 2007 and 2015. We first analyse the features and calculate their Weight of Evidence (WoE) and Information Value (IV). Based on that, we use logistic regression (a generalised linear model) to estimate the probability of default (PD), and finally use the coefficients of the logistic regression to generate a credit scorecard.

We will also validate the performance of our predictions. After that, we will estimate Exposure at Default (EAD) and Loss Given Default (LGD).

Next, we take the data for 2016-2017 and treat it as new data that needs to be integrated with our model. Before we can do that, however, we have to quantify how similar or dissimilar the new data is to the existing data. We will do this via the Population Stability Index (PSI).

Pre-process input data

Let us start by loading the data and dropping all columns whose values are entirely missing, as such columns are of no use anyway (a sketch of the dropping step is shown after the loading code below).

library(tidyverse)
library(dplyr)
library(readxl)
library(stringr)
library(purrr)
library(lubridate)
library(caret)
library(ggplot2)
library(data.table)
library(knitr)
# path to the downloaded Kaggle CSV; adjust to your local copy
loan_data <- fread("C:\\Asus WebStorage\\nitin.7785@gmail.com\\MySyncFolder\\Credit_Risk_Modelling\\Credit-Risk-Model\\publish_git\\data\\lc_loan.csv")
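
The column-dropping step mentioned above is not shown explicitly in the loading chunk; a minimal sketch of it, assuming dplyr 1.0+ for the where() selection helper, could look like this:

# drop columns in which every single value is missing
loan_data <- loan_data %>%
  select(where(~ !all(is.na(.x))))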

Our data has the following columns:

print(colnames(loan_data))
##  [1] "id"                          "member_id"                  
##  [3] "loan_amnt"                   "funded_amnt"                
##  [5] "funded_amnt_inv"             "term"                       
##  [7] "int_rate"                    "installment"                
##  [9] "grade"                       "sub_grade"                  
## [11] "emp_title"                   "emp_length"                 
## [13] "home_ownership"              "annual_inc"                 
## [15] "verification_status"         "issue_d"                    
## [17] "loan_status"                 "pymnt_plan"                 
## [19] "url"                         "desc"                       
## [21] "purpose"                     "title"                      
## [23] "zip_code"                    "addr_state"                 
## [25] "dti"                         "delinq_2yrs"                
## [27] "earliest_cr_line"            "inq_last_6mths"             
## [29] "mths_since_last_delinq"      "mths_since_last_record"     
## [31] "open_acc"                    "pub_rec"                    
## [33] "revol_bal"                   "revol_util"                 
## [35] "total_acc"                   "initial_list_status"        
## [37] "out_prncp"                   "out_prncp_inv"              
## [39] "total_pymnt"                 "total_pymnt_inv"            
## [41] "total_rec_prncp"             "total_rec_int"              
## [43] "total_rec_late_fee"          "recoveries"                 
## [45] "collection_recovery_fee"     "last_pymnt_d"               
## [47] "last_pymnt_amnt"             "next_pymnt_d"               
## [49] "last_credit_pull_d"          "collections_12_mths_ex_med" 
## [51] "mths_since_last_major_derog" "policy_code"                
## [53] "application_type"            "annual_inc_joint"           
## [55] "dti_joint"                   "verification_status_joint"  
## [57] "acc_now_delinq"              "tot_coll_amt"               
## [59] "tot_cur_bal"                 "open_acc_6m"                
## [61] "open_il_6m"                  "open_il_12m"                
## [63] "open_il_24m"                 "mths_since_rcnt_il"         
## [65] "total_bal_il"                "il_util"                    
## [67] "open_rv_12m"                 "open_rv_24m"                
## [69] "max_bal_bc"                  "all_util"                   
## [71] "total_rev_hi_lim"            "inq_fi"                     
## [73] "total_cu_tl"                 "inq_last_12m"

Let us examine two columns, emp_length and term:

distinct(loan_data, emp_length)
distinct(loan_data, term)

We would like to convert them from strings to numeric values:

numextract <- function(string){
  str_extract(string, "\\-*\\d+\\.*\\d*")
}
loan_data <- loan_data %>%
  mutate(emp_length_int = gsub("< 1 year", "0", emp_length))

loan_data$emp_length_int <- map_chr(loan_data$emp_length_int, numextract) %>%
  as.double() %>%
replace_na(0)

loan_data <- loan_data %>%
  mutate(term_int = ifelse(term == '36 months', 36, 60))
distinct(loan_data, term_int)
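
As a quick sanity check of the extraction (the inputs below illustrate the emp_length formats in the raw data), numextract pulls out the leading number as a string:

numextract("10+ years")   # "10"
numextract("1 year")      # "1"
numextract("n/a")         # NA; converted to 0 by replace_na() above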

Next we parse earliest_cr_line and issue_d as dates and calculate the number of months elapsed up to 2017-12-01, storing the results as mths_since_earliest_cr_line and mths_since_issue_d.

loan_data$mths_since_earliest_cr_line <- interval(parse_date_time(loan_data$earliest_cr_line, "by"),
                                                   as.POSIXct("2017-12-01"))%/% months(1)

# some earliest_cr_line values have two-digit years that parse into the future,
# producing negative month counts; replace those with the maximum observed value
loan_data$mths_since_earliest_cr_line <- ifelse(loan_data$mths_since_earliest_cr_line < 0,
                                               max(loan_data$mths_since_earliest_cr_line, na.rm = TRUE),
                                               loan_data$mths_since_earliest_cr_line)


loan_data$mths_since_issue_d <- interval(parse_date_time(loan_data$issue_d, "by"),
                                         as.POSIXct("2017-12-01"))%/% months(1)

Check for missing values and fill them in as appropriate:

# sapply(loan_data, function(y) sum(length(which(is.na(y)))))

# total_rev_hi_lim is the total revolving high credit/credit limit, so it is reasonable to fill its missing values with funded_amnt
loan_data$total_rev_hi_lim <- ifelse(is.na(loan_data$total_rev_hi_lim),
                                     loan_data$funded_amnt, loan_data$total_rev_hi_lim)

# missing annual income is replaced by the mean annual income
loan_data$annual_inc <- ifelse(is.na(loan_data$annual_inc),
                               mean(loan_data$annual_inc, na.rm = TRUE), loan_data$annual_inc)

# for the remaining columns, fill NA with 0
loan_data$mths_since_earliest_cr_line <- ifelse(is.na(loan_data$mths_since_earliest_cr_line),
                                                0, loan_data$mths_since_earliest_cr_line)

loan_data$acc_now_delinq <- ifelse(is.na(loan_data$acc_now_delinq), 0, loan_data$acc_now_delinq)
loan_data$total_acc <- ifelse(is.na(loan_data$total_acc), 0, loan_data$total_acc)
loan_data$pub_rec <- ifelse(is.na(loan_data$pub_rec), 0, loan_data$pub_rec)
loan_data$open_acc <- ifelse(is.na(loan_data$open_acc), 0, loan_data$open_acc)
loan_data$inq_last_6mths <- ifelse(is.na(loan_data$inq_last_6mths), 0, loan_data$inq_last_6mths)
loan_data$delinq_2yrs <- ifelse(is.na(loan_data$delinq_2yrs), 0, loan_data$delinq_2yrs)
loan_data$emp_length_int <- ifelse(is.na(loan_data$emp_length_int), 0, loan_data$emp_length_int)
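
To confirm the imputations above left no missing values behind, a quick check (not part of the original pipeline, and assuming dplyr 1.0+ for across()) can be run on the columns we just touched:

filled_cols <- c("total_rev_hi_lim", "annual_inc", "mths_since_earliest_cr_line",
                 "acc_now_delinq", "total_acc", "pub_rec", "open_acc",
                 "inq_last_6mths", "delinq_2yrs", "emp_length_int")
# every count below should now be zero
loan_data %>%
  summarise(across(all_of(filled_cols), ~ sum(is.na(.x))))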

As we will not work with all the columns, we keep only the columns listed below:

# remove unwanted columns
keep_columns <- c("id", "grade",
                  "home_ownership", "loan_status", "addr_state",
                  "verification_status", "purpose",
                  "initial_list_status", "term_int",
                  "emp_length_int", "mths_since_issue_d",
                  "int_rate", "funded_amnt",
                  "mths_since_earliest_cr_line", "delinq_2yrs",
                  "inq_last_6mths", "open_acc",
                  "pub_rec", "total_acc",
                  "acc_now_delinq", "total_rev_hi_lim",
                  "installment", "annual_inc",
                  "mths_since_last_delinq", "dti",
                  "mths_since_last_record", "total_rec_prncp", 
                  "total_rec_int", "total_rec_late_fee", "recoveries")
loan_data <- loan_data %>%
  select(all_of(keep_columns))

To model the probability of default (PD) we need to convert each loan_status into default or no default. Let's first examine the distinct values of loan_status in the data:

print(distinct(loan_data, loan_status))
##                                             loan_status
##  1:                                          Fully Paid
##  2:                                         Charged Off
##  3:                                             Current
##  4:                                             Default
##  5:                                  Late (31-120 days)
##  6:                                     In Grace Period
##  7:                                   Late (16-30 days)
##  8:  Does not meet the credit policy. Status:Fully Paid
##  9: Does not meet the credit policy. Status:Charged Off
## 10:                                              Issued

We then convert loan_status into a good/bad flag:

distinct(loan_data, loan_status)
loan_data %>%
  group_by(loan_status) %>%
  summarise(n_distinct(id)/nrow(loan_data))
# create good_bad (default or not) column from loan_status and observe the frequency of good/bad loans
bad_status <- c('Charged Off', 'Default',
                'Does not meet the credit policy. Status:Charged Off',
                'Late (31-120 days)')
loan_data$good_bad <- ifelse(loan_data$loan_status %in% bad_status, 0, 1)
loan_data %>% group_by(good_bad) %>% summarise(n_distinct(id)/nrow(loan_data))

So now our data has been preprocessed and we can move on to feature exploration and engineering to prepare it for logistic regression. Let's first divide the data into training and test sets:

train_index <- createDataPartition(loan_data$good_bad,p=0.9,list=FALSE)
train_data <- loan_data[train_index,]
test_data <- loan_data[-train_index,]
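
A quick check (not in the original code) that the split keeps the default rate comparable between the two sets; calling set.seed() before createDataPartition would additionally make the split reproducible:

# share of good loans in each split - the two numbers should be very close
c(train = mean(train_data$good_bad), test = mean(test_data$good_bad))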

We will engineer/modify our columns based on their Weight of Evidence (WoE) and Information Value (IV). Since we will be doing this for many columns, it makes sense to write a function for calculating WoE. To decide on suitable categories for a feature, we look at the WoE per original category and, where possible, combine categories to reduce the number of parameters in our logistic regression model. Inspecting the WoE of each category is best done visually, so we also write a function that plots WoE per feature. Since our features can be either continuous or discrete, we need separate functions for each type.
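
For reference, writing $g_i$ and $b_i$ for the proportion of all good and all bad borrowers that fall into category $i$ (the prop_n_good and prop_n_bad columns computed inside the functions below), the two quantities are

$$\mathrm{WoE}_i = \ln\frac{g_i}{b_i}, \qquad \mathrm{IV} = \sum_i (g_i - b_i)\,\mathrm{WoE}_i$$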

calculate_weight_of_evidence_discrete <- function(train_data, WoE_col, target_col) {
  df_input <- train_data
  df1 <- df_input %>%
    select_at(c(WoE_col, target_col))
  # number of observations per category
  df_n_obs <- df1 %>% group_by_at(WoE_col) %>% summarise(n_obs = n())

  # proportion of good borrowers (target == 1) per category
  df_good_prop <- df1 %>% group_by_at(WoE_col) %>% summarise(prop_good = mean(get(target_col)))
  df_final <- df_n_obs %>%
    left_join(df_good_prop)

  # share of all goods and all bads that fall into each category
  df_final$prop_n_obs <- df_final$n_obs / sum(df_final$n_obs)
  df_final$n_good <- df_final$prop_good * df_final$n_obs
  df_final$n_bad <- (1-df_final$prop_good) * df_final$n_obs
  df_final$prop_n_good <- df_final$n_good / sum(df_final$n_good)
  df_final$prop_n_bad <- df_final$n_bad / sum(df_final$n_bad)
  df_final$WoE <- log(df_final$prop_n_good / df_final$prop_n_bad)
  df_final <- df_final %>%
    arrange(WoE)
  df_final <- df_final %>%
    mutate(diff_prop_good = prop_good - lag(prop_good, default = first(prop_good)),
           diff_WoE = WoE - lag(WoE, default = first(WoE)))
  # categories with no goods or no bads give NA/Inf WoE; set those to 0
  df_final$WoE <- replace_na(df_final$WoE, 0)
  df_final$WoE <- ifelse(is.finite(df_final$WoE), df_final$WoE, 0)
  df_final <- df_final %>%
    mutate(IV = sum((prop_n_good - prop_n_bad) * WoE))
  print(paste0(WoE_col, " IV is: ", distinct(df_final, IV)))
  df_final <- df_final %>%
    arrange(WoE) %>%
    mutate(prop_obs = n_obs / sum(n_obs))
}

plot_woe_discrete <- function(plot_data, x_axis, x_label, y_scale, angl) {
  ggplot(data = plot_data, aes(x = reorder(get(x_axis), WoE))) +
    geom_line(aes(y = WoE, group = 1), color = 'blue') +
    geom_point(aes(y = WoE, group = 1), color = 'blue') +
    geom_bar(aes(y = prop_obs / y_scale, group = 1), stat="identity", size=.1, color = 'red', fill = "#69b3a2", alpha=.4) +
    theme(
      axis.title.y = element_text(color = 'blue', size=13),
      axis.title.y.right = element_text(color = 'red', size=13)
    ) +
    theme(axis.text.x = element_text(angle = angl, vjust = 0.5, hjust=1)) +
    scale_y_continuous(sec.axis = sec_axis(~.*y_scale, name = "Observation proportion")) +
    scale_colour_manual(values = c("blue", "red")) +
    labs(y = "Weight of evidence", x = x_label)
}

# identical to the discrete version above, except that the final output is
# ordered by the feature values themselves rather than by WoE
calculate_weight_of_evidence_continuous <- function(train_data, WoE_col, target_col) {
  df_input <- train_data
  df1 <- df_input %>%
    select_at(c(WoE_col, target_col))
  df_n_obs <- df1 %>% group_by_at(WoE_col) %>% summarise(n_obs = n())

  df_good_prop <- df1 %>% group_by_at(WoE_col) %>% summarise(prop_good = mean(get(target_col)))
  df_final <- df_n_obs %>%
    left_join(df_good_prop)

  df_final$prop_n_obs <- df_final$n_obs / sum(df_final$n_obs)
  df_final$n_good <- df_final$prop_good * df_final$n_obs
  df_final$n_bad <- (1-df_final$prop_good) * df_final$n_obs
  df_final$prop_n_good <- df_final$n_good / sum(df_final$n_good)
  df_final$prop_n_bad <- df_final$n_bad / sum(df_final$n_bad)
  df_final$WoE <- log(df_final$prop_n_good / df_final$prop_n_bad)
  df_final <- df_final %>%
    arrange(WoE)
  df_final <- df_final %>%
    mutate(diff_prop_good = prop_good - lag(prop_good, default = first(prop_good)),
           diff_WoE = WoE - lag(WoE, default = first(WoE)))
  df_final$WoE <- replace_na(df_final$WoE, 0)
  df_final$WoE <- ifelse(is.finite(df_final$WoE), df_final$WoE, 0)
  df_final <- df_final %>%
           mutate(IV = sum((prop_n_good - prop_n_bad) * WoE))
  print(paste0(WoE_col, " IV is: ", distinct(df_final, IV)))
  df_final <- df_final %>%
    arrange_at(WoE_col) %>%
    mutate(prop_obs = n_obs / sum(n_obs))
}

plot_woe_continuous <- function(plot_data, x_axis, x_label, y_scale) {
  ggplot(data = plot_data, aes(x = get(x_axis), WoE)) +
    geom_line(aes(y = WoE, group = 1), color = 'blue') +
    geom_point(aes(y = WoE, group = 1), color = 'blue') +
    geom_bar(aes(y = prop_obs / y_scale, group = 1), stat="identity", size=.1, color = 'red', fill = "#69b3a2", alpha=.4) +
    theme(
      axis.title.y = element_text(color = 'blue', size=13),
      axis.title.y.right = element_text(color = 'red', size=13)
    ) +
    scale_x_continuous(breaks = seq(min(plot_data %>% select_at(x_axis)), max(plot_data %>% select_at(x_axis)), by = 1)) +
    scale_y_continuous(sec.axis = sec_axis(~.*y_scale, name = "Observation proportion")) +
    scale_colour_manual(values = c("blue", "red")) +
    labs(y = "Weight of evidence", x = x_label)
}

Feature engineering

Let's start inspecting the WoE of the features and categorise them accordingly. Before we begin, we initialise a data frame to keep track of the Information Value of each feature, so that we can later identify redundant features based on their IV.

IV_df <- data.frame(matrix(ncol = 2, nrow = 0))
colnames(IV_df) <- c('WoE_col', 'IV_value')
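
Once IV_df has been filled in by the sections below, ranking it gives a quick overview of which features carry little information. A sketch (the thresholds are a conventional rule of thumb, not from the original text: IV below roughly 0.02 is considered unpredictive, 0.02-0.1 weak, 0.1-0.3 medium and above 0.3 strong):

IV_df %>%
  mutate(IV_value = as.numeric(unlist(IV_value))) %>%  # IV_value may end up stored as a list column
  arrange(desc(IV_value)) %>%
  kable()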

grade

We are all set to explore and engineer features; let's start with grade. As can be seen in the WoE plot, there are not that many grades, and when arranged in ascending order of WoE they show a clear, roughly linear trend. It therefore makes sense to preserve all grade categories.

grade_woe <- calculate_weight_of_evidence_discrete(train_data, "grade", "good_bad")
## [1] "grade IV is: 0.322122592378725"
IV_df[nrow(IV_df) + 1,] <- list("grade", distinct(grade_woe, IV))
plot_woe_discrete(grade_woe, 'grade', 'Grade', 1, 0)

train_data$grade_trnsfrmd <- train_data$grade

home_ownership

Next we examine home_ownership.

distinct(train_data, home_ownership)
home_ownership_woe <- calculate_weight_of_evidence_discrete(train_data, "home_ownership", "good_bad")
## [1] "home_ownership IV is: 0.0228438412849011"
IV_df[nrow(IV_df) + 1,] <- list("home_ownership", distinct(home_ownership_woe, IV))
IV_df[nrow(IV_df) + 1,] <- list("grade", distinct(home_ownership_woe, IV))
plot_woe_discrete(home_ownership_woe, 'home_ownership', 'Home Ownership', 5, 0)

combine_home_ownership <- c("RENT", "OTHER", "NONE", "ANY")
train_data$home_ownership_trnsfrmd <- ifelse(train_data$home_ownership %in% combine_home_ownership, "RENT_OTHER_NONE_ANY", train_data$home_ownership)

The WoE plot shows large negative WoE values for OTHER and NONE, but notice the small percentage of observations in those categories; the same is true for ANY. Hence they are not reliable on their own and can be combined with RENT into a single category. The next two categories, OWN and MORTGAGE, have a substantial number of observations and differing WoE, so they are kept as separate categories (see the plot below, where OTHER and NONE have been removed for better visibility of the observation frequencies).

plot_woe_discrete(home_ownership_woe %>% slice(3:n()), 'home_ownership', 'Home Ownership', 5, 0)
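
To confirm the merge behaves as intended, a quick check (not in the original flow) of the share of observations per new bucket; each of the three resulting levels should hold a substantial share of the data:

train_data %>%
  count(home_ownership_trnsfrmd) %>%
  mutate(share = n / sum(n)) %>%
  arrange(desc(share))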

In general, this is the principle we will follow to decide on categories for PD modelling. Below, we repeat these steps for all the other features of the input data without additional explanation, as the basic rationale for selecting categories has been explained above.

addr_state

state_woe <- calculate_weight_of_evidence_discrete(train_data, "addr_state", "good_bad")
## [1] "addr_state IV is: 0.017898691052361"
IV_df[nrow(IV_df) + 1,] <- list("addr_state", distinct(state_woe, IV))
plot_woe_discrete(state_woe %>% slice(2:(n()-1)), 'addr_state', 'State', 0.2, 0)

state1 <- c("NV", "NE", "IA", "ME")
state2 <- c("HI", "FL", "AL", "LA")
state3 <- c("NY")
state4 <- c("NC", "MD", "NM", "VA", "NJ", "UT", "MO")
state5 <- c("CA")
state6 <- c("AZ", "AR", "MI")
state7 <- c("OK", "TN", "PA")
state8 <- c("DE", "MA", "RI", "KY", "MN", "OH", "SD", "IN", "OR", "GA", "WA", "ID", "WI", "MT")
state9 <- c("TX")
state10 <- c("VT", "CT", "IL", "AK", "SC", "CO", "KS", "MS")
state11 <- c("NH", "WV", "WY")
state12 <- c("DC")

train_data$addr_state_trnsfrmd <- ifelse(train_data$addr_state %in% state1, "state_list_1",
                                   ifelse(train_data$addr_state %in% state2, "state_list_2",
                                    ifelse(train_data$addr_state %in% state3, "state_list_3",
                                     ifelse(train_data$addr_state %in% state4, "state_list_4",
                                      ifelse(train_data$addr_state %in% state5, "state_list_5",
                                       ifelse(train_data$addr_state %in% state6, "state_list_6",
                                        ifelse(train_data$addr_state %in% state7, "state_list_7",
                                         ifelse(train_data$addr_state %in% state8, "state_list_8",
                                          ifelse(train_data$addr_state %in% state9, "state_list_9",
                                           ifelse(train_data$addr_state %in% state10, "state_list_10",
                                            ifelse(train_data$addr_state %in% state11, "state_list_11",
                                             ifelse(train_data$addr_state %in% state12, "state_list_12",
                                              NaN))))))))))))
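
One subtlety of these nested ifelse() calls: because the other branches return character values, the final NaN fallback is stored as the literal string "NaN". A quick illustrative check that no state actually falls through:

sum(train_data$addr_state_trnsfrmd == "NaN")
# should be 0 if every state in the data appears in one of the lists above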

verification_status

verification_status_woe <- calculate_weight_of_evidence_discrete(train_data, "verification_status", "good_bad")
## [1] "verification_status IV is: 0.0359161144044958"
IV_df[nrow(IV_df) + 1,] <- list("verification_status", distinct(verification_status_woe, IV))
plot_woe_discrete(verification_status_woe, 'verification_status', 'Verification Status', 5, 0)

train_data$verification_status_trnsfrmd <- ifelse(train_data$verification_status %in%
                                                    c("Not Verified", "Source Verified"),
                                                  "Not or Source Verified",
                                                  train_data$verification_status)

purpose

purpose_woe <- calculate_weight_of_evidence_discrete(train_data, "purpose", "good_bad")
## [1] "purpose IV is: 0.0502648847593456"
IV_df[nrow(IV_df) + 1,] <- list("purpose", distinct(purpose_woe, IV))
plot_woe_discrete(purpose_woe, 'purpose', 'Purpose', 2, 0)

purpose1 <- c("small_business", "educational", "renewable_energy", "moving")
purpose2 <- c("other", "medical", "house")
purpose3 <- c("debt_consolidation", "vacation", "home_improvement")
purpose4 <- c("wedding", "car", "major_purchase", "credit_card")
train_data$purpose_trnsfrmd <- ifelse(train_data$purpose %in% purpose1, "purpose_list_1",
                                ifelse(train_data$purpose %in% purpose2, "purpose_list_2",
                                 ifelse(train_data$purpose %in% purpose3, "purpose_list_3",
                                  ifelse(train_data$purpose %in% purpose4, "purpose_list_4", NaN))))

initial_list_status

init_list_status_woe <- calculate_weight_of_evidence_discrete(train_data, "initial_list_status", "good_bad")
## [1] "initial_list_status IV is: 0.153102356011061"
IV_df[nrow(IV_df) + 1,] <- list("initial_list_status", distinct(init_list_status_woe, IV))
plot_woe_discrete(init_list_status_woe, 'initial_list_status', 'Initial List Status', 2, 0)

train_data$initial_list_status_trnsfrmd <- train_data$initial_list_status

term_int

term_int_woe <- calculate_weight_of_evidence_continuous(train_data, 'term_int', 'good_bad')
## [1] "term_int IV is: 0.0208232197620997"
IV_df[nrow(IV_df) + 1,] <- list("term_int", distinct(term_int_woe, IV))
plot_woe_continuous(term_int_woe, 'term_int', 'Loan Term', 2)

train_data$term_int_trnsfrmd <- ifelse(train_data$term_int == 36, 'term_36',
                                       ifelse(train_data$term_int == 60, 'term_60', NaN))

emp_length_int

emp_length_int_woe <- calculate_weight_of_evidence_continuous(train_data, 'emp_length_int', 'good_bad')
## [1] "emp_length_int IV is: 0.00947114613881647"
IV_df[nrow(IV_df) + 1,] <- list("emp_length_int", distinct(emp_length_int_woe, IV))
plot_woe_continuous(emp_length_int_woe %>% arrange(emp_length_int), 'emp_length_int', 'Employment Length', 3)

train_data$emp_length_int_trnsfrmd <- ifelse(train_data$emp_length_int %in% c(0), 'emp_length_0',
                                       ifelse(train_data$emp_length_int %in% c(1,2,3,4), 'emp_length_1_4',
                                        ifelse(train_data$emp_length_int %in% c(5,6), 'emp_length_5_6',
                                         ifelse(train_data$emp_length_int %in% c(7,8,9), 'emp_length_7_9',
                                            'emp_length_10'))))

mths_since_issue_d

mths_since_issue_d_woe <- calculate_weight_of_evidence_continuous(train_data, 'mths_since_issue_d', 'good_bad')
## [1] "mths_since_issue_d IV is: 0.918043906280894"
IV_df[nrow(IV_df) + 1,] <- list("mths_since_issue_d", distinct(mths_since_issue_d_woe, IV))
plot_woe_continuous(mths_since_issue_d_woe, 'mths_since_issue_d', 'Months Since Issue Date', 0.1)

train_data$mths_since_issue_d_trnsfrmd <- ifelse(train_data$mths_since_issue_d <= 55, 'mths_since_issue_date<=55',
                                           ifelse(train_data$mths_since_issue_d %in% c(56,57,58,59,60,61,62,63), 'mths_since_issue_date>=56<=63',
                                            ifelse(train_data$mths_since_issue_d %in% seq(64,84), 'mths_since_issue_date>63<=84',
                                             ifelse(train_data$mths_since_issue_d > 84, 'mths_since_issue_data>84', NaN))))

int_rate_factor

train_data <- train_data %>%
  arrange(int_rate)
train_data$int_rate_factor <- cut(train_data$int_rate, 50)
int_rate_factor_woe <- calculate_weight_of_evidence_continuous(train_data, 'int_rate_factor', 'good_bad')
## [1] "int_rate_factor IV is: 0.556944351873259"
IV_df[nrow(IV_df) + 1,] <- list("int_rate_factor", distinct(int_rate_factor_woe, IV))
plot_woe_discrete(int_rate_factor_woe, 'int_rate_factor', 'Interest rate factor', 0.02, 90)

train_data$int_rate_factor_trnsfrmd <- ifelse(train_data$int_rate <= 9.548, 'int_rate<9.548',
                                        ifelse(train_data$int_rate <= 12.025, '9.548<int_rate<=12.025',
                                          ifelse(train_data$int_rate <= 15.74, '12.025<int_rate<=15.74',
                                           ifelse(train_data$int_rate <= 20.281, '15.74<int_rate<20.281',
                                            ifelse(train_data$int_rate >20.281, 'int_rate>20.281',NaN)))))

funded_amnt_factor

train_data$funded_amnt_factor <- cut(train_data$funded_amnt, 50)
funded_amnt_factor_woe <- calculate_weight_of_evidence_continuous(train_data, 'funded_amnt_factor', 'good_bad')
## [1] "funded_amnt_factor IV is: 0.0191689251632706"
IV_df[nrow(IV_df) + 1,] <- list("funded_amnt_factor", distinct(funded_amnt_factor_woe, IV))
plot_woe_discrete(funded_amnt_factor_woe, 'funded_amnt_factor', 'Funded amount factor', 0.02, 90)

train_data$funded_amnt_factor_trnsfrmd <- ifelse(train_data$funded_amnt <= 5000, 'funded_amnt:<5K',
                                           ifelse(train_data$funded_amnt <= 10000, 'funded_amnt:5K-10K',
                                            ifelse(train_data$funded_amnt <= 20000, 'funded_amnt:10K:20K',
                                             ifelse(train_data$funded_amnt <= 30000, 'funded_amnt:20K:30:',
                                              ifelse(train_data$funded_amnt <= 50000, 'funded_amnt:30K:50K',
                                              ifelse(train_data$funded_amnt <= 70000, 'funded_amnt:50K:70K',                                                 ifelse(train_data$funded_amnt <= 100000, 'funded_amnt:70K:100K',                                               ifelse(train_data$funded_amnt > 100000, 'funded_amnt:>100K',
                                              NaN))))))))

mths_since_earliest_cr_line_factor

train_data <- train_data %>%
  arrange(mths_since_earliest_cr_line)
train_data$mths_since_earliest_cr_line_factor <- cut(train_data$mths_since_earliest_cr_line, 50)
mths_since_earliest_cr_line_woe <- calculate_weight_of_evidence_continuous(train_data, 'mths_since_earliest_cr_line_factor', 'good_bad')
## [1] "mths_since_earliest_cr_line_factor IV is: 0.0151324882437869"
IV_df[nrow(IV_df) + 1,] <- list("mths_since_earliest_cr_line", distinct(mths_since_earliest_cr_line_woe, IV))
plot_woe_discrete(mths_since_earliest_cr_line_woe, 'mths_since_earliest_cr_line_factor', 'Months since earliest credit line', 0.1, 90)

train_data$mths_since_earliest_cr_line_factor_trnsfrmd <- ifelse(train_data$mths_since_earliest_cr_line <= 140, 'mths_since_earliest_cr_line<=140',
                                                          ifelse(train_data$mths_since_earliest_cr_line <= 246, '140<mths_since_earliest_cr_line<=246',
                                                          ifelse(train_data$mths_since_earliest_cr_line <= 270, '246<mths_since_earliest_cr_line<=270',
                                                          ifelse(train_data$mths_since_earliest_cr_line <= 293, '270<mths_since_earliest_cr_line<=293',
                                                          ifelse(train_data$mths_since_earliest_cr_line <= 398, '293<mths_since_earliest_cr_line<=398',
                                                          ifelse(train_data$mths_since_earliest_cr_line >398, 'mths_since_earliest_cr_line>398',
                                                          NaN))))))

delinq_2yrs

delinq_2yrs_woe <- calculate_weight_of_evidence_continuous(train_data, 'delinq_2yrs', 'good_bad')
## [1] "delinq_2yrs IV is: 0.00106847433774323"
IV_df[nrow(IV_df) + 1,] <- list("delinq_2yrs", distinct(delinq_2yrs_woe, IV))
plot_woe_continuous(delinq_2yrs_woe, 'delinq_2yrs', 'Deliquency in last 2 years', 2)

train_data$delinq_2yrs_trnsfrmd <- ifelse(train_data$delinq_2yrs == 0, 'delinq_2yrs=0',
                                    ifelse(train_data$delinq_2yrs == 1, 'delinq_2yrs=1',
                                     ifelse(train_data$delinq_2yrs == 2, 'delinq_2yrs=2',
                                      ifelse(train_data$delinq_2yrs > 2, 'delinq_2yrs>2',
                                             NaN))))

inq_last_6mths

inq_last_6mths_woe <- calculate_weight_of_evidence_continuous(train_data, 'inq_last_6mths', 'good_bad')
## [1] "inq_last_6mths IV is: 0.0925669321100512"
IV_df[nrow(IV_df) + 1,] <- list("inq_last_mths", distinct(inq_last_6mths_woe, IV))
plot_woe_continuous(inq_last_6mths_woe, 'inq_last_6mths', 'Inq last 6 months', 1)

train_data$inq_last_6mths_trnsfrmd <- ifelse(train_data$inq_last_6mths == 0, 'inq_last_6mths=0',
                                       ifelse(train_data$inq_last_6mths == 1, 'inq_last_6mths=1',
                                        ifelse(train_data$inq_last_6mths == 2, 'inq_last_6mths=2',
                                         ifelse(train_data$inq_last_6mths == 3, 'inq_last_6mths=3',
                                          ifelse(train_data$inq_last_6mths > 3, 'inq_last_6mths>3',
                                                 NaN)))))

open_acc

open_acc_woe <- calculate_weight_of_evidence_continuous(train_data, 'open_acc', 'good_bad')
## [1] "open_acc IV is: 0.00856851247687463"
IV_df[nrow(IV_df) + 1,] <- list("open_acc", distinct(open_acc_woe, IV))
plot_woe_continuous(open_acc_woe, 'open_acc', 'Open Account', 0.1)

plot_woe_continuous(open_acc_woe %>% filter(open_acc>3, open_acc<20), 'open_acc', 'Open Account', 2)

train_data$open_acc_trnsfrmd <- ifelse(train_data$open_acc %in% c(0,1,2,3,seq(21,max(train_data$open_acc))), 'open_acc=0,1,2,3,21-max',
                                 ifelse(train_data$open_acc %in% c(4,5,6,7), 'open_acc=4,5,6,7',
                                  ifelse(train_data$open_acc %in% c(8), 'open_acc=8',
                                   ifelse(train_data$open_acc %in% c(9,10,11,12), 'open_acc=9,10,11,12',
                                    ifelse(train_data$open_acc %in% c(13,14,15,16), 'open_acc=13,14,15,16',
                                     ifelse(train_data$open_acc %in% c(17), 'open_acc=17',
                                      ifelse(train_data$open_acc %in% c(18,19,20), 'open_acc=18,19,20',
                                             NaN)))))))

pub_rec

pub_rec_woe <- calculate_weight_of_evidence_continuous(train_data, 'pub_rec', 'good_bad')
## [1] "pub_rec IV is: 0.00528665682035972"
IV_df[nrow(IV_df) + 1,] <- list("pub_rec", distinct(pub_rec_woe, IV))
plot_woe_continuous(pub_rec_woe, 'pub_rec', 'Public Records', 1)

train_data$pub_rec_trnsfrmd <- ifelse(train_data$pub_rec == 0, 'pub_rec=0',
                                ifelse(train_data$pub_rec == 1, 'pub_rec=1',
                                 ifelse(train_data$pub_rec == 2, 'pub_rec=2',
                                  ifelse(train_data$pub_rec >2, 'pub_rec>=3',
                                         NaN))))

total_acc_factor

train_data <- train_data %>%
  arrange(total_acc)
train_data$total_acc_factor <- cut(train_data$total_acc, 50)
total_acc_woe <- calculate_weight_of_evidence_continuous(train_data, 'total_acc_factor', 'good_bad')
## [1] "total_acc_factor IV is: 0.0100298483435379"
IV_df[nrow(IV_df) + 1,] <- list("total_acc_factor", distinct(total_acc_woe, IV))
plot_woe_discrete(total_acc_woe, 'total_acc_factor', 'Total account factor', 0.1, 90)

train_data$total_acc_factor_trnsfrmd <- ifelse(train_data$total_acc <= 10, 'total_acc<=10',
                                         ifelse(train_data$total_acc <= 25, 'total_acc<=25',
                                          ifelse(train_data$total_acc <= 50, 'total_acc<=50',
                                           ifelse(train_data$total_acc > 50, 'total_acc>50',
                                                   NaN))))

acc_now_delinq

acc_now_delinq_woe <- calculate_weight_of_evidence_continuous(train_data, 'acc_now_delinq', 'good_bad')
## [1] "acc_now_delinq IV is: 0.000114176389107013"
IV_df[nrow(IV_df) + 1,] <- list("acc_now_delinq", distinct(acc_now_delinq_woe, IV))
plot_woe_continuous(acc_now_delinq_woe, 'acc_now_delinq', 'Account now delinquent', 1.0)

train_data$acc_now_delinq_trnsfrmd <- ifelse(train_data$acc_now_delinq == 0, 'acc_now_delinq=0',
                                       ifelse(train_data$acc_now_delinq >= 1, 'acc_now_delinq>=1',
                                              NaN))

total_rev_hi_lim_factor

train_data <- train_data %>%
  arrange(total_rev_hi_lim)
train_data$total_rev_hi_lim_factor <- cut(train_data$total_rev_hi_lim, 1000)
total_rev_hi_lim_woe <- calculate_weight_of_evidence_continuous(train_data, 'total_rev_hi_lim_factor', 'good_bad')
## [1] "total_rev_hi_lim_factor IV is: 0.0826053581761765"
IV_df[nrow(IV_df) + 1,] <- list("total_rev_hi_lim_factor", distinct(total_rev_hi_lim_woe, IV))
plot_woe_discrete(total_rev_hi_lim_woe, 'total_rev_hi_lim_factor', 'Total revolving hi limit factor', 1, 90)

train_data$total_rev_hi_lim_factor_trnsfrmd <- ifelse(train_data$total_rev_hi_lim <= 5000, 'total_rev_hi_lim<=5K',
                                               ifelse(train_data$total_rev_hi_lim <= 10000, 'total_rev_hi_lim:5K-10K',
                                                ifelse(train_data$total_rev_hi_lim <= 20000, 'total_rev_hi_lim:10K-20K',
                                                 ifelse(train_data$total_rev_hi_lim <= 30000, 'total_rev_hi_lim:20K-30K',
                                                  ifelse(train_data$total_rev_hi_lim <= 40000, 'total_rev_hi_lim:30K-40K',
                                                   ifelse(train_data$total_rev_hi_lim <= 50000, 'total_rev_hi_lim:40K-50K',
                                                    ifelse(train_data$total_rev_hi_lim <= 100000, 'total_rev_hi_lim:50K-100L',
                                                     ifelse(train_data$total_rev_hi_lim > 100000, 'total_rev_hi_lim>100K',
                                                      NaN))))))))

installment_factor

train_data <- train_data %>% arrange(installment)
train_data$installment_factor <- cut(train_data$installment, 100)
installment_woe <- calculate_weight_of_evidence_continuous(train_data, 'installment_factor', 'good_bad')
## [1] "installment_factor IV is: 0.0282190404686732"
IV_df[nrow(IV_df) + 1,] <- list("installment_factor", distinct(installment_woe, IV))
plot_woe_discrete(installment_woe, 'installment_factor', 'Installment Factor', 0.05, 90)

train_data$installment_factor_trnsfrmd <- ifelse(train_data$installment <= 30, 'installment<=30',
                                          ifelse(train_data$installment <= 100, 'installment:30-100',
                                           ifelse(train_data$installment <= 200, 'installment:100-200',
                                            ifelse(train_data$installment <= 300, 'installment:200-300',
                                             ifelse(train_data$installment <= 400, 'installment:300-400',
                                              ifelse(train_data$installment <= 500, 'installment:400-500',
                                               ifelse(train_data$installment <= 600, 'installment:500:600',
                                                ifelse(train_data$installment <= 700, 'installment:600-700',
                                                 ifelse(train_data$installment <= 800, 'installment:700-800',
                                                  ifelse(train_data$installment > 800, 'installment>800',
                                                          NaN))))))))))

annual_inc_factor

train_data <- train_data %>% arrange(annual_inc)
train_data_temp <- train_data %>%
  filter(annual_inc <= 140000)
train_data_temp$annual_inc_factor <- cut(train_data_temp$annual_inc, 50)
annual_inc_woe <- calculate_weight_of_evidence_continuous(train_data_temp, 'annual_inc_factor', 'good_bad')
## [1] "annual_inc_factor IV is: 0.0523239023770557"
IV_df[nrow(IV_df) + 1,] <- list("annual_inc_factor", distinct(annual_inc_woe, IV))
plot_woe_discrete(annual_inc_woe, 'annual_inc_factor', 'Annual Income Factor', 0.1, 90)

train_data$annual_inc_factor_trnsfrmd <- ifelse(train_data$annual_inc <= 20000, 'annual_inc<=20K',
                                         ifelse(train_data$annual_inc <= 30000, 'annual_inc:20K-30K',
                                          ifelse(train_data$annual_inc <= 40000, 'annual_inc:30K-40K',
                                           ifelse(train_data$annual_inc <= 50000, 'annual_inc:40K-50K',
                                            ifelse(train_data$annual_inc <= 60000, 'annual_inc:50K-60K',
                                             ifelse(train_data$annual_inc <= 70000, 'annual_inc:60K-70K',
                                              ifelse(train_data$annual_inc <= 80000, 'annual_inc:70K-80K',
                                               ifelse(train_data$annual_inc <= 90000, 'annual_inc:80K-90K',
                                                ifelse(train_data$annual_inc <= 100000, 'annual_inc:90K-100K',
                                                 ifelse(train_data$annual_inc <= 120000, 'annual_inc:100K-120K',
                                                  ifelse(train_data$annual_inc <= 140000, 'annual_inc:120K-140K',
                                                   ifelse(train_data$annual_inc > 140000, 'annual_inc:>140K',
                                                          NaN))))))))))))

mths_since_last_delinq_factor

train_data <- train_data %>%
  arrange(mths_since_last_delinq)
train_data$mths_since_last_delinq_factor <- cut(train_data$mths_since_last_delinq, 50)
mths_since_last_delinq_woe <- calculate_weight_of_evidence_continuous(train_data, 'mths_since_last_delinq_factor', 'good_bad')
## [1] "mths_since_last_delinq_factor IV is: 0.00477673892483607"
IV_df[nrow(IV_df) + 1,] <- list("mths_since_last_delinq_factor", distinct(mths_since_last_delinq_woe, IV))
plot_woe_discrete(mths_since_last_delinq_woe, 'mths_since_last_delinq_factor', 'Months since last delinquency factor', 0.5, 90)

train_data$mths_since_last_delinq_factor_trnsfrmd <- ifelse(train_data$mths_since_last_delinq <= 3, 'mths_since_last_delinq<=3',
                                                     ifelse(train_data$mths_since_last_delinq <= 7, 'mths_since_last_delinq:4-7',
                                                      ifelse(train_data$mths_since_last_delinq <= 40, 'mths_since_last_delinq:7-40',
                                                       ifelse(train_data$mths_since_last_delinq <= 80, 'mths_since_last_delinq:40-80',
                                                        ifelse(train_data$mths_since_last_delinq > 80, 'mths_since_last_delinq>80',
                                                         ifelse(is.na(train_data$mths_since_last_delinq), 'Missing',
                                                                NaN))))))

dti_factor

train_data <- train_data %>%
  arrange(dti)
train_data_temp <- train_data %>%
  filter(dti <= 35)
train_data_temp$dti_factor <- cut(train_data_temp$dti, 20) # try with 100 and 50 cuts
train_data <- train_data %>%
  left_join(train_data_temp %>% select(id, dti_factor))
dti_woe <- calculate_weight_of_evidence_continuous(train_data_temp, 'dti_factor', 'good_bad')
## [1] "dti_factor IV is: 0.0131144001658101"
IV_df[nrow(IV_df) + 1,] <- list("dti_factor", distinct(dti_woe, IV))
plot_woe_discrete(dti_woe, 'dti_factor', 'Debt to Income', 0.1, 90)

train_data$dti_factor_trnsfrmd <- ifelse(train_data$dti <= 1, 'dti:<1',
                                   ifelse(train_data$dti <= 2, 'dti:1-2',
                                    ifelse(train_data$dti <= 4, 'dti:2-4',
                                     ifelse(train_data$dti <= 6, 'dti:4-6',
                                      ifelse(train_data$dti <= 8, 'dti:6-8',
                                       ifelse(train_data$dti <= 10, 'dti:8-10',
                                        ifelse(train_data$dti <= 15, 'dti:10-15',
                                         ifelse(train_data$dti <= 20, 'dti:15-20',
                                          ifelse(train_data$dti <= 25, 'dti:20-25',
                                           ifelse(train_data$dti <= 30, 'dti:25-30',
                                            ifelse(train_data$dti <= 35, 'dti:30-35',
                                             ifelse(train_data$dti >35, 'dti:>35',
                                                    NaN))))))))))))

mths_since_last_record_factor

train_data <- train_data %>%
  arrange(mths_since_last_record)
train_data$mths_since_last_record_factor <- cut(train_data$mths_since_last_record, 20)
mths_since_last_record_woe <- calculate_weight_of_evidence_continuous(train_data, 'mths_since_last_record_factor', 'good_bad')
## [1] "mths_since_last_record_factor IV is: 0.0253121582979921"
IV_df[nrow(IV_df) + 1,] <- list("mths_since_last_record_factor", distinct(mths_since_last_record_woe, IV))
plot_woe_discrete(mths_since_last_record_woe, 'mths_since_last_record_factor', 'Months since last record factor', 0.1, 90)

train_data$mths_since_last_record_factor_trnsfrmd <- ifelse(train_data$mths_since_last_record <= 2, 'mths_since_last_record<=2',
                                                     ifelse(train_data$mths_since_last_record <= 30, 'mths_since_last_record:3-30',
                                                      ifelse(train_data$mths_since_last_record <= 65, 'mths_since_last_record:31-65',
                                                       ifelse(train_data$mths_since_last_record <= 80, 'mths_since_last_record:66-80',
                                                        ifelse(train_data$mths_since_last_record > 80, 'mths_since_last_record>80',
                                                         ifelse(is.na(train_data$mths_since_last_record), 'Missing',
                                                                NaN))))))

Note that the ifelse() chains above never reach their is.na() branch for missing inputs (a comparison with NA already returns NA), so we explicitly replace the remaining NAs with 'Missing'. Finally, we retain only the transformed columns and the good/bad flag.

train_data <- train_data %>%
  mutate(mths_since_last_record_factor_trnsfrmd = replace_na(mths_since_last_record_factor_trnsfrmd, 'Missing'))
train_data <- train_data %>%
  mutate(mths_since_last_delinq_factor_trnsfrmd = replace_na(mths_since_last_delinq_factor_trnsfrmd, 'Missing'))

train_data_trnsfrmd <- train_data %>%
  select(ends_with(c('good_bad', 'trnsfrmd')))

Preparing test data

Using exactly the same categories, we now transform the test data so that we can later measure the performance of our PD model.

test_data$grade_trnsfrmd <- test_data$grade

test_data$home_ownership_trnsfrmd <- ifelse(test_data$home_ownership %in% combine_home_ownership, "RENT_OTHER_NONE_ANY", test_data$home_ownership)
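
Since the model will be fit on the training categories, it is worth verifying as we go that every bucket appearing in the test set also exists in the training set. An illustrative check using home_ownership (character(0) means no unseen categories):

setdiff(unique(test_data$home_ownership_trnsfrmd),
        unique(train_data$home_ownership_trnsfrmd))
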
state1 <- c("NV", "NE", "IA", "ME")
state2 <- c("HI", "FL", "AL", "LA")
state3 <- c("NY")
state4 <- c("NC", "MD", "NM", "VA", "NJ", "UT", "MO")
state5 <- c("CA")
state6 <- c("AZ", "AR", "MI")
state7 <- c("OK", "TN", "PA")
state8 <- c("DE", "MA", "RI", "KY", "MN", "OH", "SD", "IN", "OR", "GA", "WA", "ID", "WI", "MT")
state9 <- c("TX")
state10 <- c("VT", "CT", "IL", "AK", "SC", "CO", "KS", "MS")
state11 <- c("NH", "WV", "WY")
state12 <- c("DC")
test_data$addr_state_trnsfrmd <- ifelse(test_data$addr_state %in% state1, "state_list_1",
                                         ifelse(test_data$addr_state %in% state2, "state_list_2",
                                                ifelse(test_data$addr_state %in% state3, "state_list_3",
                                                       ifelse(test_data$addr_state %in% state4, "state_list_4",
                                                              ifelse(test_data$addr_state %in% state5, "state_list_5",
                                                                     ifelse(test_data$addr_state %in% state6, "state_list_6",
                                                                            ifelse(test_data$addr_state %in% state7, "state_list_7",
                                                                                   ifelse(test_data$addr_state %in% state8, "state_list_8",
                                                                                          ifelse(test_data$addr_state %in% state9, "state_list_9",
                                                                                                 ifelse(test_data$addr_state %in% state10, "state_list_10",
                                                                                                        ifelse(test_data$addr_state %in% state11, "state_list_11",
                                                                                                               ifelse(test_data$addr_state %in% state12, "state_list_12",NaN))))))))))))


test_data$verification_status_trnsfrmd <- ifelse(test_data$verification_status %in% c("Not Verified", "Source Verified"),
                                                  "Not or Source Verified",
                                                  test_data$verification_status)

purpose1 <- c("small_business", "educational", "renewable_energy", "moving")
purpose2 <- c("other", "medical", "house")
purpose3 <- c("debt_consolidation", "vacation", "home_improvement")
purpose4 <- c("wedding", "car", "major_purchase", "credit_card")
test_data$purpose_trnsfrmd <- ifelse(test_data$purpose %in% purpose1, "purpose_list_1",
                                      ifelse(test_data$purpose %in% purpose2, "purpose_list_2",
                                             ifelse(test_data$purpose %in% purpose3, "purpose_list_3",
                                                    ifelse(test_data$purpose %in% purpose4, "purpose_list_4", NaN))))

test_data$initial_list_status_trnsfrmd <- test_data$initial_list_status

test_data$term_int_trnsfrmd <- ifelse(test_data$term_int == 36, 'term_36',
                                       ifelse(test_data$term_int == 60, 'term_60', NaN))

test_data$emp_length_int_trnsfrmd <- ifelse(test_data$emp_length_int %in% c(0), 'emp_length_0',
                                             ifelse(test_data$emp_length_int %in% c(1,2,3,4), 'emp_length_1_4',
                                                    ifelse(test_data$emp_length_int %in% c(5,6), 'emp_length_5_6',
                                                           ifelse(test_data$emp_length_int %in% c(7,8,9), 'emp_length_7_9',
                                                                  'emp_length_10'))))

test_data$mths_since_issue_d_trnsfrmd <- ifelse(test_data$mths_since_issue_d <= 55, 'mths_since_issue_date<=55',
                                                 ifelse(test_data$mths_since_issue_d %in% c(56,57,58,59,60,61,62,63), 'mths_since_issue_date>=56<=63',
                                                        ifelse(test_data$mths_since_issue_d %in% seq(64,84), 'mths_since_issue_date>63<=84',
                                                               ifelse(test_data$mths_since_issue_d > 84, 'mths_since_issue_data>84', NaN))))

test_data$int_rate_factor_trnsfrmd <- ifelse(test_data$int_rate <= 9.548, 'int_rate<9.548',
                                              ifelse(test_data$int_rate <= 12.025, '9.548<int_rate<=12.025',
                                                     ifelse(test_data$int_rate <= 15.74, '12.025<int_rate<=15.74',
                                                            ifelse(test_data$int_rate <= 20.281, '15.74<int_rate<20.281',
                                                                   ifelse(test_data$int_rate >20.281, 'int_rate>20.281',NaN)))))

test_data$funded_amnt_factor_trnsfrmd <- ifelse(test_data$funded_amnt <= 5000, 'funded_amnt:<5K',
                                                 ifelse(test_data$funded_amnt <= 10000, 'funded_amnt:5K-10K',
                                                        ifelse(test_data$funded_amnt <= 20000, 'funded_amnt:10K:20K',
                                                               ifelse(test_data$funded_amnt <= 30000, 'funded_amnt:20K:30:',
                                                                      ifelse(test_data$funded_amnt <= 50000, 'funded_amnt:30K:50K',
                                                                             ifelse(test_data$funded_amnt <= 70000, 'funded_amnt:50K:70K',
                                                                                    ifelse(test_data$funded_amnt <= 100000, 'funded_amnt:70K:100K',
                                                                                           ifelse(test_data$funded_amnt > 100000, 'funded_amnt:>100K',
                                                                                                  NaN))))))))

test_data$mths_since_earliest_cr_line_factor_trnsfrmd <- ifelse(test_data$mths_since_earliest_cr_line <= 140, 'mths_since_earliest_cr_line<=140',
                                                                 ifelse(test_data$mths_since_earliest_cr_line <= 246, '140<mths_since_earliest_cr_line<=246',
                                                                        ifelse(test_data$mths_since_earliest_cr_line <= 270, '246<mths_since_earliest_cr_line<=270',
                                                                               ifelse(test_data$mths_since_earliest_cr_line <= 293, '270<mths_since_earliest_cr_line<=293',
                                                                                      ifelse(test_data$mths_since_earliest_cr_line <= 398, '293<mths_since_earliest_cr_line<=398',
                                                                                             ifelse(test_data$mths_since_earliest_cr_line >398, 'mths_since_earliest_cr_line>398',
                                                                                                    NaN))))))

test_data$delinq_2yrs_trnsfrmd <- ifelse(test_data$delinq_2yrs == 0, 'delinq_2yrs=0',
                                          ifelse(test_data$delinq_2yrs == 1, 'delinq_2yrs=1',
                                                 ifelse(test_data$delinq_2yrs == 2, 'delinq_2yrs=2',
                                                        ifelse(test_data$delinq_2yrs > 2, 'delinq_2yrs>2',
                                                               NaN))))

test_data$inq_last_6mths_trnsfrmd <- ifelse(test_data$inq_last_6mths == 0, 'inq_last_6mths=0',
                                             ifelse(test_data$inq_last_6mths == 1, 'inq_last_6mths=1',
                                                    ifelse(test_data$inq_last_6mths == 2, 'inq_last_6mths=2',
                                                           ifelse(test_data$inq_last_6mths == 3, 'inq_last_6mths=3',
                                                                  ifelse(test_data$inq_last_6mths > 3, 'inq_last_6mths>3',
                                                                         NaN)))))

test_data$open_acc_trnsfrmd <- ifelse(test_data$open_acc %in% c(0,1,2,3,seq(21,max(test_data$open_acc))), 'open_acc=0,1,2,3,21-max',
                                       ifelse(test_data$open_acc %in% c(4,5,6,7), 'open_acc=4,5,6,7',
                                              ifelse(test_data$open_acc %in% c(8), 'open_acc=8',
                                                     ifelse(test_data$open_acc %in% c(9,10,11,12), 'open_acc=9,10,11,12',
                                                            ifelse(test_data$open_acc %in% c(13,14,15,16), 'open_acc=13,14,15,16',
                                                                   ifelse(test_data$open_acc %in% c(17), 'open_acc=17',
                                                                          ifelse(test_data$open_acc %in% c(18,19,20), 'open_acc=18,19,20',
                                                                                 NaN)))))))

test_data$pub_rec_trnsfrmd <- ifelse(test_data$pub_rec == 0, 'pub_rec=0',
                                      ifelse(test_data$pub_rec == 1, 'pub_rec=1',
                                             ifelse(test_data$pub_rec == 2, 'pub_rec=2',
                                                    ifelse(test_data$pub_rec >2, 'pub_rec>=3',
                                                           NaN))))

test_data$total_acc_factor_trnsfrmd <- ifelse(test_data$total_acc <= 10, 'total_acc<=10',
                                               ifelse(test_data$total_acc <= 25, 'total_acc<=25',
                                                      ifelse(test_data$total_acc <= 50, 'total_acc<=50',
                                                             ifelse(test_data$total_acc > 50, 'total_acc>50',
                                                                    NaN))))

test_data$acc_now_delinq_trnsfrmd <- ifelse(test_data$acc_now_delinq == 0, 'acc_now_delinq=0',
                                             ifelse(test_data$acc_now_delinq >= 1, 'acc_now_delinq>=1',
                                                    NaN))

test_data$total_rev_hi_lim_factor_trnsfrmd <- ifelse(test_data$total_rev_hi_lim <= 5000, 'total_rev_hi_lim<=5K',
                                                      ifelse(test_data$total_rev_hi_lim <= 10000, 'total_rev_hi_lim:5K-10K',
                                                             ifelse(test_data$total_rev_hi_lim <= 20000, 'total_rev_hi_lim:10K-20K',
                                                                    ifelse(test_data$total_rev_hi_lim <= 30000, 'total_rev_hi_lim:20K-30K',
                                                                           ifelse(test_data$total_rev_hi_lim <= 40000, 'total_rev_hi_lim:30K-40K',
                                                                                  ifelse(test_data$total_rev_hi_lim <= 50000, 'total_rev_hi_lim:40K-50K',
                                                                                         ifelse(test_data$total_rev_hi_lim <= 100000, 'total_rev_hi_lim:50K-100L',
                                                                                                ifelse(test_data$total_rev_hi_lim > 100000, 'total_rev_hi_lim>100K',
                                                                                                       NaN))))))))


test_data$installment_factor_trnsfrmd <- ifelse(test_data$installment <= 30, 'installment<=30',
                                                 ifelse(test_data$installment <= 100, 'installment:30-100',
                                                        ifelse(test_data$installment <= 200, 'installment:100-200',
                                                               ifelse(test_data$installment <= 300, 'installment:200-300',
                                                                      ifelse(test_data$installment <= 400, 'installment:300-400',
                                                                             ifelse(test_data$installment <= 500, 'installment:400-500',
                                                                                    ifelse(test_data$installment <= 600, 'installment:500:600',
                                                                                           ifelse(test_data$installment <= 700, 'installment:600-700',
                                                                                                  ifelse(test_data$installment <= 800, 'installment:700-800',
                                                                                                         ifelse(test_data$installment > 800, 'installment>800',
                                                                                                                NaN))))))))))

test_data$annual_inc_factor_trnsfrmd <- ifelse(test_data$annual_inc <= 20000, 'annual_inc<=20K',
                                                ifelse(test_data$annual_inc <= 30000, 'annual_inc:20K-30K',
                                                       ifelse(test_data$annual_inc <= 40000, 'annual_inc:30K-40K',
                                                              ifelse(test_data$annual_inc <= 50000, 'annual_inc:40K-50K',
                                                                     ifelse(test_data$annual_inc <= 60000, 'annual_inc:50K-60K',
                                                                            ifelse(test_data$annual_inc <= 70000, 'annual_inc:60K-70K',
                                                                                   ifelse(test_data$annual_inc <= 80000, 'annual_inc:70K-80K',
                                                                                          ifelse(test_data$annual_inc <= 90000, 'annual_inc:80K-90K',
                                                                                                 ifelse(test_data$annual_inc <= 100000, 'annual_inc:90K-100K',
                                                                                                        ifelse(test_data$annual_inc <= 120000, 'annual_inc:100K-120K',
                                                                                                               ifelse(test_data$annual_inc <= 140000, 'annual_inc:120K-140K',
                                                                                                                      ifelse(test_data$annual_inc > 140000, 'annual_inc:>140K',
                                                                                                                             NaN))))))))))))
test_data$mths_since_last_delinq_factor_trnsfrmd <- ifelse(test_data$mths_since_last_delinq <= 3, 'mths_since_last_delinq<=3',
                                                            ifelse(test_data$mths_since_last_delinq <= 7, 'mths_since_last_delinq:4-7',
                                                                   ifelse(test_data$mths_since_last_delinq <= 40, 'mths_since_last_delinq:7-40',
                                                                          ifelse(test_data$mths_since_last_delinq <= 80, 'mths_since_last_delinq:40-80',
                                                                                 ifelse(test_data$mths_since_last_delinq > 80, 'mths_since_last_delinq>80',
                                                                                        ifelse(is.na(test_data$mths_since_last_delinq), 'Missing',
                                                                                               NaN))))))
test_data$dti_factor_trnsfrmd <- ifelse(test_data$dti <= 1, 'dti:<1',
                                         ifelse(test_data$dti <= 2, 'dti:1-2',
                                                ifelse(test_data$dti <= 4, 'dti:2-4',
                                                       ifelse(test_data$dti <= 6, 'dti:4-6',
                                                              ifelse(test_data$dti <= 8, 'dti:6-8',
                                                                     ifelse(test_data$dti <= 10, 'dti:8-10',
                                                                            ifelse(test_data$dti <= 15, 'dti:10-15',
                                                                                   ifelse(test_data$dti <= 20, 'dti:15-20',
                                                                                          ifelse(test_data$dti <= 25, 'dti:20-25',
                                                                                                 ifelse(test_data$dti <= 30, 'dti:25-30',
                                                                                                        ifelse(test_data$dti <= 35, 'dti:30-35',
                                                                                                               ifelse(test_data$dti >35, 'dti:>35',
                                                                                                                      NaN))))))))))))

test_data$mths_since_last_record_factor_trnsfrmd <- ifelse(test_data$mths_since_last_record <= 2, 'mths_since_last_record<=2',
                                                            ifelse(test_data$mths_since_last_record <= 30, 'mths_since_last_record:3-30',
                                                                   ifelse(test_data$mths_since_last_record <= 65, 'mths_since_last_record:31-65',
                                                                          ifelse(test_data$mths_since_last_record <= 80, 'mths_since_last_record:66-80',
                                                                                 ifelse(test_data$mths_since_last_record > 80, 'mths_since_last_record>80',
                                                                                        ifelse(is.na(test_data$mths_since_last_record), 'Missing',
                                                                                               NaN))))))

test_data <- test_data %>%
  mutate(mths_since_last_record_factor_trnsfrmd = replace_na(mths_since_last_record_factor_trnsfrmd, 'Missing'))
test_data <- test_data %>%
  mutate(mths_since_last_delinq_factor_trnsfrmd = replace_na(mths_since_last_delinq_factor_trnsfrmd, 'Missing'))

test_data_trnsfrmd <- test_data %>%
  select(ends_with(c('good_bad', 'trnsfrmd')))

Logistic Regression

We have now prepared our train and test data for model fitting. Before we proceed, we inspect the Information Value (IV) table and check for variables with extremely low IV. These variables can then be removed, as they have little or no predictive power. Our IV table is shown below:

library(knitr)
kable(IV_df, caption='Information value table')
Information value table
WoE_col IV_value
grade 0.3221226
home_ownership 0.02284384
grade 0.02284384
addr_state 0.01789869
verification_status 0.03591611
purpose 0.05026488
initial_list_status 0.1531024
term_int 0.02082322
emp_length_int 0.009471146
mths_since_issue_d 0.9180439
int_rate_factor 0.5569444
funded_amnt_factor 0.01916893
mths_since_earliest_cr_line 0.01513249
delinq_2yrs 0.001068474
inq_last_6mths 0.09256693
open_acc 0.008568512
pub_rec 0.005286657
total_acc_factor 0.01002985
acc_now_delinq 0.0001141764
total_rev_hi_lim_factor 0.08260536
installment_factor 0.02821904
annual_inc_factor 0.0523239
mths_since_last_delinq_factor 0.004776739
dti_factor 0.0131144
mths_since_last_record_factor 0.02531216
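
For reference, the following is a minimal sketch of how WoE and IV could be computed for a single categorical feature; the function name and the convention WoE = ln(%good/%bad) are assumptions, not necessarily the exact code used to build IV_df:

# compute the Information Value of one categorical column of `df`,
# given a binary good_bad column (1 = good, 0 = bad)
compute_iv <- function(df, feature) {
  df %>%
    group_by(.data[[feature]]) %>%
    summarise(n_good = sum(good_bad == 1),
              n_bad  = sum(good_bad == 0), .groups = 'drop') %>%
    mutate(pct_good = n_good / sum(n_good),
           pct_bad  = n_bad / sum(n_bad),
           WoE = log(pct_good / pct_bad),                # categories with zero goods or bads would need extra handling
           IV_contrib = (pct_good - pct_bad) * WoE) %>%
    summarise(IV = sum(IV_contrib, na.rm = TRUE)) %>%
    pull(IV)
}

# e.g. compute_iv(train_data_trnsfrmd, 'grade_trnsfrmd')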

Generally, IV values smaller than 0.02 are considered to indicate no predictive power. However, we will set the bar lower and use an IV cutoff of 0.01: any feature with an IV below 0.01 is removed from the prepared train and test data and is therefore not used in the logistic regression. At the same time, let's convert the string columns of the training data to factors, the de facto datatype for representing categories in R.

library(stats)
library(broom)
library(tidyverse)
library(dplyr)
library(readxl)
library(stringr)
library(purrr)
library(lubridate)
library(caret)
library(ggplot2)
library(data.table)

train_data_trnsfrmd <- train_data_trnsfrmd %>%
  select(-contains(pull(IV_df %>% filter(IV_value < 0.01) %>% select(WoE_col))))
test_data_trnsfrmd <- test_data_trnsfrmd %>%
  select(-contains(pull(IV_df %>% filter(IV_value < 0.01) %>% select(WoE_col))))

train_data_trnsfrmd <- train_data_trnsfrmd %>% mutate_if(is.character,as.factor)

We are now all set to fit the logistic regression to our train data and extract the coefficients of the retained features,

model <- glm(good_bad ~.,family=binomial(link='logit'),data=train_data_trnsfrmd)
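
For a quick look at the estimated coefficients and their p-values, broom's tidy() (loaded above) can be used; this is only an inspection step and does not affect the pipeline:

model_summary <- tidy(model)   # one row per coefficient: estimate, std.error, statistic, p.value
head(model_summary)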

Validation/Performance measurement

After cleaning and processing the data, the performance metrics of the logistic regression fit are as follows:

Misclassification Error

The misclassification error on the test set is 0.0886.
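
A minimal sketch of how this number could be reproduced, assuming a 0.5 probability cutoff (the cutoff and the `predicted` object are assumptions here, not shown in the original code):

predicted <- predict(model, test_data_trnsfrmd, type = 'response')   # predicted probability of a good loan
predicted_class <- ifelse(predicted > 0.5, 1, 0)                     # classify with an assumed 0.5 cutoff
misclass_error <- mean(predicted_class != test_data_trnsfrmd$good_bad, na.rm = TRUE)
print(misclass_error)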

AUROC

The ROC curve and the corresponding AUROC are shown below,

plotROC(test_data_trnsfrmd$good_bad, predicted)

Gini and Kolmogorov-Smirnov coefficients

The Gini index is 0.4863805 and the Kolmogorov-Smirnov statistic is 0.3628.
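
Since Gini = 2 × AUROC − 1, the reported Gini of 0.486 corresponds to an AUROC of roughly 0.74. A minimal sketch of how these statistics could be computed, assuming the pROC package is available (an assumption; it is not loaded elsewhere in this document):

# AUROC and Gini (pROC is an assumption, not part of the original pipeline)
library(pROC)
auroc <- as.numeric(auc(roc(test_data_trnsfrmd$good_bad, predicted)))
gini  <- 2 * auroc - 1

# Kolmogorov-Smirnov: maximum distance between the score distributions of good and bad loans
ks <- ks.test(predicted[test_data_trnsfrmd$good_bad == 1],
              predicted[test_data_trnsfrmd$good_bad == 0])$statistic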

Score card calculation

Before we can calculate the credit score card, we have to prepare a table of model coefficients as follows,

test <- dummy.coef(model)
# build a table with one row per (feature, category) pair and the corresponding regression coefficient
score_card_df <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(score_card_df) <- c('col+category', 'col', 'category', 'regression_coeff')
for (i in seq_along(test)){
  for (j in seq_along(test[[i]])){
    col_category <- paste0(names(test[i]), names(test[[i]][j]))  # e.g. 'grade_trnsfrmdB', matching glm coefficient names
    correspndng_col <- names(test[i])                            # feature name
    correspndng_category <- names(test[[i]][j])                  # category within the feature
    reg_coeff <- test[[i]][[j]]                                  # coefficient (0 for reference categories)
    score_card_df[nrow(score_card_df) + 1,] = list(col_category, correspndng_col, correspndng_category, reg_coeff)
  }
}

p_values_df <- stack(coef(summary(model))[,4])
coeff_df <- stack(coefficients(model))
col_names <- colnames(train_data_trnsfrmd)

We are now ready to calculate the score card. Each regression coefficient is scaled linearly so that the range of achievable total scores maps onto the usual credit-score range of 300 to 850,

max_sum_coef <- sum(pull(score_card_df %>%
                           group_by(col) %>%
                           summarise(regression_coeff = max(regression_coeff)) %>%
                           select(regression_coeff)))

min_sum_coef <- sum(pull(score_card_df %>%
                           group_by(col) %>%
                           summarise(regression_coeff = min(regression_coeff)) %>%
                           select(regression_coeff)))

max_score <- 850
min_score <- 300
score_card_df <- score_card_df %>%
  mutate(score_calc = regression_coeff * (max_score - min_score) / (max_sum_coef - min_sum_coef)) %>%
  mutate(score_calc = ifelse(category == '(Intercept)',
                             (regression_coeff - min_sum_coef)/(max_sum_coef - min_sum_coef) *
                               (max_score - min_score) + min_score, score_calc)) %>%
  mutate(score_preliminary = round(score_calc))

max_sum_score_prelimnry <- sum(pull(score_card_df %>%
                                      group_by(col) %>%
                                      summarise(score_preliminary = max(score_preliminary)) %>%
                                      select(score_preliminary)))

min_sum_score_prelimnry <- sum(pull(score_card_df %>%
                                      group_by(col) %>%
                                      summarise(score_preliminary = min(score_preliminary)) %>%
                                      select(score_preliminary)))
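
As a quick sanity check (not in the original code), the sums of the per-feature minimum and maximum preliminary scores should come out close to 300 and 850 respectively, up to rounding:

print(c(min_sum_score_prelimnry, max_sum_score_prelimnry))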

We also calculate a credit score for each customer based on this score card. A convenient trick is to replace the model's coefficients with the preliminary scores, so that predict() simply adds up the score contributions for each record,

model1 <- model

score_coeff <- score_card_df %>%
  filter(regression_coeff != 0) %>%
  select(col, score_preliminary)

cat_names <- pull(score_coeff, col)
score_coeff <- pull(score_coeff, score_preliminary)
names(score_coeff) <- cat_names

model1$coefficients <- score_coeff
train_data_trnsfrmd <- train_data_trnsfrmd %>%
  mutate(credit_score = predict(model1, train_data_trnsfrmd))

test_data_trnsfrmd <- test_data_trnsfrmd %>%
  mutate(credit_score = predict(model1, test_data_trnsfrmd))
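
An optional check (not part of the original pipeline) is to look at the distribution of the calculated credit scores, for example with a histogram:

ggplot(train_data_trnsfrmd, aes(x = credit_score)) +
  geom_histogram(binwidth = 10) +
  labs(x = 'Credit score (training data)', y = 'Number of loans')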


score_card_final <- score_card_df %>%
  select(col, category, regression_coeff, score_preliminary)

colnames(score_card_final) <- c('Feature', 'Category', 'Regression Coefficient', 'Credit Score')

The resulting score card is as follows:

kable(score_card_final, caption='Score Card')
Score Card
Feature Category Regression Coefficient Credit Score
(Intercept) (Intercept) 4.8751718 620
grade_trnsfrmd A 0.0000000 0
grade_trnsfrmd B 0.3562979 14
grade_trnsfrmd C 0.5832382 23
grade_trnsfrmd D 0.6986421 28
grade_trnsfrmd E 0.9067396 36
grade_trnsfrmd F 0.9546764 38
grade_trnsfrmd G 0.7543712 30
home_ownership_trnsfrmd MORTGAGE 0.0000000 0
home_ownership_trnsfrmd OWN 0.0069895 0
home_ownership_trnsfrmd RENT_OTHER_NONE_ANY -0.1187902 -5
addr_state_trnsfrmd NaN 0.0000000 0
addr_state_trnsfrmd state_list_1 -3.1855832 -127
addr_state_trnsfrmd state_list_10 -2.8422071 -113
addr_state_trnsfrmd state_list_11 -2.7436093 -109
addr_state_trnsfrmd state_list_12 -2.6018673 -104
addr_state_trnsfrmd state_list_2 -3.1401084 -125
addr_state_trnsfrmd state_list_3 -3.1279773 -125
addr_state_trnsfrmd state_list_4 -3.0983589 -123
addr_state_trnsfrmd state_list_5 -3.1058598 -124
addr_state_trnsfrmd state_list_6 -3.0375841 -121
addr_state_trnsfrmd state_list_7 -3.0309108 -121
addr_state_trnsfrmd state_list_8 -2.9953680 -119
addr_state_trnsfrmd state_list_9 -2.9182484 -116
verification_status_trnsfrmd Not or Source Verified 0.0000000 0
verification_status_trnsfrmd Verified -0.0629930 -3
purpose_trnsfrmd purpose_list_1 0.0000000 0
purpose_trnsfrmd purpose_list_2 0.3056351 12
purpose_trnsfrmd purpose_list_3 0.3466141 14
purpose_trnsfrmd purpose_list_4 0.4280740 17
initial_list_status_trnsfrmd f 0.0000000 0
initial_list_status_trnsfrmd w 0.3485581 14
term_int_trnsfrmd term_36 0.0000000 0
term_int_trnsfrmd term_60 -0.0270510 -1
mths_since_issue_d_trnsfrmd mths_since_issue_data>84 0.0000000 0
mths_since_issue_d_trnsfrmd mths_since_issue_date= 1.0273177 41
mths_since_issue_d_trnsfrmd mths_since_issue_date>=56<=63 0.3240516 13
mths_since_issue_d_trnsfrmd mths_since_issue_date>63<=84 0.0377305 2
int_rate_factor_trnsfrmd 12.025<int_rate<=15.74 0.0000000 0
int_rate_factor_trnsfrmd 15.74<int_rate<20.281 -0.6821086 -27
int_rate_factor_trnsfrmd 9.548<int_rate<=12.025 0.6362081 25
int_rate_factor_trnsfrmd int_rate<9.548 1.5881446 63
int_rate_factor_trnsfrmd int_rate>20.281 -1.3516344 -54
funded_amnt_factor_trnsfrmd funded_amnt:<5K 0.0000000 0
funded_amnt_factor_trnsfrmd funded_amnt:10K:20K -0.1309269 -5
funded_amnt_factor_trnsfrmd funded_amnt:20K:30: -0.0760804 -3
funded_amnt_factor_trnsfrmd funded_amnt:30K:50K 0.0058769 0
funded_amnt_factor_trnsfrmd funded_amnt:5K-10K -0.2454671 -10
mths_since_earliest_cr_line_factor_trnsfrmd 140<mths_since_earliest_cr_line<=246 0.0000000 0
mths_since_earliest_cr_line_factor_trnsfrmd 246<mths_since_earliest_cr_line<=270 -0.0118017 0
mths_since_earliest_cr_line_factor_trnsfrmd 270<mths_since_earliest_cr_line<=293 0.0182333 1
mths_since_earliest_cr_line_factor_trnsfrmd 293<mths_since_earliest_cr_line<=398 0.0118682 0
mths_since_earliest_cr_line_factor_trnsfrmd mths_since_earliest_cr_line<=140 0.0871624 3
mths_since_earliest_cr_line_factor_trnsfrmd mths_since_earliest_cr_line>398 -0.0373978 -1
inq_last_6mths_trnsfrmd inq_last_6mths=0 0.0000000 0
inq_last_6mths_trnsfrmd inq_last_6mths=1 -0.1528066 -6
inq_last_6mths_trnsfrmd inq_last_6mths=2 -0.2981309 -12
inq_last_6mths_trnsfrmd inq_last_6mths=3 -0.4337185 -17
inq_last_6mths_trnsfrmd inq_last_6mths>3 -0.5711087 -23
total_acc_factor_trnsfrmd total_acc<=10 0.0000000 0
total_acc_factor_trnsfrmd total_acc<=25 -0.1011973 -4
total_acc_factor_trnsfrmd total_acc<=50 -0.1370747 -5
total_acc_factor_trnsfrmd total_acc>50 -0.1078410 -4
total_rev_hi_lim_factor_trnsfrmd total_rev_hi_lim:10K-20K 0.0000000 0
total_rev_hi_lim_factor_trnsfrmd total_rev_hi_lim:20K-30K 0.0502214 2
total_rev_hi_lim_factor_trnsfrmd total_rev_hi_lim:30K-40K 0.0564718 2
total_rev_hi_lim_factor_trnsfrmd total_rev_hi_lim:40K-50K 0.1088412 4
total_rev_hi_lim_factor_trnsfrmd total_rev_hi_lim:50K-100L 0.1303267 5
total_rev_hi_lim_factor_trnsfrmd total_rev_hi_lim:5K-10K -0.0345617 -1
total_rev_hi_lim_factor_trnsfrmd total_rev_hi_lim<=5K -0.1350045 -5
total_rev_hi_lim_factor_trnsfrmd total_rev_hi_lim>100K 0.3046205 12
installment_factor_trnsfrmd installment:100-200 0.0000000 0
installment_factor_trnsfrmd installment:200-300 -0.1212032 -5
installment_factor_trnsfrmd installment:30-100 0.1448740 6
installment_factor_trnsfrmd installment:300-400 -0.2405831 -10
installment_factor_trnsfrmd installment:400-500 -0.3412615 -14
installment_factor_trnsfrmd installment:500:600 -0.4149059 -17
installment_factor_trnsfrmd installment:600-700 -0.4346135 -17
installment_factor_trnsfrmd installment:700-800 -0.4475585 -18
installment_factor_trnsfrmd installment<=30 -0.2536697 -10
installment_factor_trnsfrmd installment>800 -0.5732664 -23
annual_inc_factor_trnsfrmd annual_inc:>140K 0.0000000 0
annual_inc_factor_trnsfrmd annual_inc:100K-120K -0.0719114 -3
annual_inc_factor_trnsfrmd annual_inc:120K-140K 0.0097124 0
annual_inc_factor_trnsfrmd annual_inc:20K-30K -0.7327546 -29
annual_inc_factor_trnsfrmd annual_inc:30K-40K -0.6519481 -26
annual_inc_factor_trnsfrmd annual_inc:40K-50K -0.5142317 -20
annual_inc_factor_trnsfrmd annual_inc:50K-60K -0.4453559 -18
annual_inc_factor_trnsfrmd annual_inc:60K-70K -0.3208593 -13
annual_inc_factor_trnsfrmd annual_inc:70K-80K -0.2675392 -11
annual_inc_factor_trnsfrmd annual_inc:80K-90K -0.1755016 -7
annual_inc_factor_trnsfrmd annual_inc:90K-100K -0.1508294 -6
annual_inc_factor_trnsfrmd annual_inc<=20K -0.7749635 -31
dti_factor_trnsfrmd dti:<1 0.0000000 0
dti_factor_trnsfrmd dti:>35 0.4639364 18
dti_factor_trnsfrmd dti:1-2 -0.0313432 -1
dti_factor_trnsfrmd dti:10-15 0.0216819 1
dti_factor_trnsfrmd dti:15-20 -0.0232163 -1
dti_factor_trnsfrmd dti:2-4 0.0662698 3
dti_factor_trnsfrmd dti:20-25 -0.0646567 -3
dti_factor_trnsfrmd dti:25-30 -0.1112251 -4
dti_factor_trnsfrmd dti:30-35 0.0325305 1
dti_factor_trnsfrmd dti:4-6 0.1200488 5
dti_factor_trnsfrmd dti:6-8 0.1163105 5
dti_factor_trnsfrmd dti:8-10 0.0667734 3
mths_since_last_record_factor_trnsfrmd Missing 0.0000000 0
mths_since_last_record_factor_trnsfrmd mths_since_last_record:3-30 0.2355819 9
mths_since_last_record_factor_trnsfrmd mths_since_last_record:31-65 0.4139568 16
mths_since_last_record_factor_trnsfrmd mths_since_last_record:66-80 0.3945855 16
mths_since_last_record_factor_trnsfrmd mths_since_last_record<=2 -0.6925020 -28
mths_since_last_record_factor_trnsfrmd mths_since_last_record>80 -0.1467817 -6

Monitoring

For monitoring, we first transform the new data in the same way as we did the train and test data.

Pre-process new data

new_data <- fread("C:\\Asus WebStorage\\nitin.7785@gmail.com\\MySyncFolder\\Credit_Risk_Modelling\\Credit-Risk-Model\\publish_git\\data\\lc_2016_2017.csv")

# General Preprocessing
# remove columns with all NA's
new_data <- new_data %>%
  select_if(~sum(!is.na(.)) > 0)

# convert emp_length column to integers, replace "< 1 year" with 0 and replace NA's with 0
new_data <- new_data %>% 
  mutate(emp_length_int = gsub("< 1 year", "0", emp_length))

new_data$emp_length_int <- map_chr(new_data$emp_length_int, numextract) %>%
  as.double() %>%
  replace_na(0)

# convert term_int column to integers
new_data <- new_data %>%
  mutate(term_int = ifelse(term == '36 months', 36, 60))
distinct(new_data, term_int)
# convert earliest_cr_line and issue_d to dates and calculate the number of months
# up to 01-12-2017
new_data$mths_since_earliest_cr_line <- interval(parse_date_time(new_data$earliest_cr_line, "by"), 
                                                  as.POSIXct("2017-12-01"))%/% months(1)

new_data$mths_since_earliest_cr_line <- ifelse(new_data$mths_since_earliest_cr_line < 0, 
                                                max(new_data$mths_since_earliest_cr_line, na.rm = TRUE), 
                                                new_data$mths_since_earliest_cr_line)


new_data$mths_since_issue_d <- interval(parse_date_time(new_data$issue_d, "by"), 
                                         as.POSIXct("2017-12-01"))%/% months(1)

# preprocessing discrete variables
# since glm handles factor variables directly, we do not need to create dummy variables; we simply note down the discrete variable names
# grade, sub_grade, home_ownership, verification_status, loan_status, purpose, addr_state, initial_list_status


# check for missing values and clean
data_summary <- summary(new_data)

# total_rev_hi_lim is 'Total revolving high credit / credit limit', so replacing its missing values with funded_amnt is a reasonable choice.
new_data$total_rev_hi_lim <- ifelse(is.na(new_data$total_rev_hi_lim), 
                                     new_data$funded_amnt, new_data$total_rev_hi_lim)

# missing annual income is replaced by the mean annual income
new_data$annual_inc <- ifelse(is.na(new_data$annual_inc), 
                               mean(new_data$annual_inc, na.rm = TRUE), new_data$annual_inc)

# for the remaining columns, fill NAs with 0
new_data$mths_since_earliest_cr_line <- ifelse(is.na(new_data$mths_since_earliest_cr_line), 
                                                0, new_data$mths_since_earliest_cr_line)

new_data$acc_now_delinq <- ifelse(is.na(new_data$acc_now_delinq), 0, new_data$acc_now_delinq)
new_data$total_acc <- ifelse(is.na(new_data$total_acc), 0, new_data$total_acc)
new_data$pub_rec <- ifelse(is.na(new_data$pub_rec), 0, new_data$pub_rec)
new_data$open_acc <- ifelse(is.na(new_data$open_acc), 0, new_data$open_acc)
new_data$inq_last_6mths <- ifelse(is.na(new_data$inq_last_6mths), 0, new_data$inq_last_6mths)
new_data$delinq_2yrs <- ifelse(is.na(new_data$delinq_2yrs), 0, new_data$delinq_2yrs)
new_data$emp_length_int <- ifelse(is.na(new_data$emp_length_int), 0, new_data$emp_length_int)

# remove unwanted columns
keep_columns <- c("id", "grade",                            
                  "home_ownership", "loan_status", "addr_state",
                  "verification_status", "purpose",
                  "initial_list_status", "term_int",
                  "emp_length_int", "mths_since_issue_d",
                  "int_rate", "funded_amnt",
                  "mths_since_earliest_cr_line", "delinq_2yrs",                     
                  "inq_last_6mths", "open_acc",
                  "pub_rec", "total_acc",
                  "acc_now_delinq", "total_rev_hi_lim",
                  "installment", "annual_inc",
                  "mths_since_last_delinq", "dti",
                  "mths_since_last_record")
new_data <- new_data %>%
  select(all_of(keep_columns))

#### PD model
# Data preparation
# Dependent variable, good/bad (default) definition
# check each loan_status frequency

# create good_bad (default or not) column from loan_status and observe the frequency of good/bad loans
bad_status <- c('Charged Off', 'Default',
                'Does not meet the credit policy. Status:Charged Off',
                'Late (31-120 days)')
new_data$good_bad <- ifelse(new_data$loan_status %in% bad_status, 0, 1)
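
As a quick sanity check (illustrative, not in the original code), we can look at the share of good and bad loans in the new data:

prop.table(table(new_data$good_bad))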

Transform new data

new_data$grade_trnsfrmd <- new_data$grade

new_data$home_ownership_trnsfrmd <- ifelse(new_data$home_ownership %in% combine_home_ownership, "RENT_OTHER_NONE_ANY", new_data$home_ownership)
state1 <- c("NV", "NE", "IA", "ME")
state2 <- c("HI", "FL", "AL", "LA")
state3 <- c("NY")
state4 <- c("NC", "MD", "NM", "VA", "NJ", "UT", "MO")
state5 <- c("CA")
state6 <- c("AZ", "AR", "MI")
state7 <- c("OK", "TN", "PA")
state8 <- c("DE", "MA", "RI", "KY", "MN", "OH", "SD", "IN", "OR", "GA", "WA", "ID", "WI", "MT")
state9 <- c("TX")
state10 <- c("VT", "CT", "IL", "AK", "SC", "CO", "KS", "MS")
state11 <- c("NH", "WV", "WY")
state12 <- c("DC")
new_data$addr_state_trnsfrmd <- ifelse(new_data$addr_state %in% state1, "state_list_1",
                                        ifelse(new_data$addr_state %in% state2, "state_list_2",
                                               ifelse(new_data$addr_state %in% state3, "state_list_3",
                                                      ifelse(new_data$addr_state %in% state4, "state_list_4",
                                                             ifelse(new_data$addr_state %in% state5, "state_list_5",
                                                                    ifelse(new_data$addr_state %in% state6, "state_list_6",
                                                                           ifelse(new_data$addr_state %in% state7, "state_list_7",
                                                                                  ifelse(new_data$addr_state %in% state8, "state_list_8",
                                                                                         ifelse(new_data$addr_state %in% state9, "state_list_9",
                                                                                                ifelse(new_data$addr_state %in% state10, "state_list_10",
                                                                                                       ifelse(new_data$addr_state %in% state11, "state_list_11",
                                                                                                              ifelse(new_data$addr_state %in% state12, "state_list_12",NaN))))))))))))


new_data$verification_status_trnsfrmd <- ifelse(new_data$verification_status %in% c("Not Verified", "Source Verified"),
                                                 "Not or Source Verified",
                                                 new_data$verification_status)

purpose1 <- c("small_business", "educational", "renewable_energy", "moving")
purpose2 <- c("other", "medical", "house")
purpose3 <- c("debt_consolidation", "vacation", "home_improvement")
purpose4 <- c("wedding", "car", "major_purchase", "credit_card")
new_data$purpose_trnsfrmd <- ifelse(new_data$purpose %in% purpose1, "purpose_list_1",
                                     ifelse(new_data$purpose %in% purpose2, "purpose_list_2",
                                            ifelse(new_data$purpose %in% purpose3, "purpose_list_3",
                                                   ifelse(new_data$purpose %in% purpose4, "purpose_list_4", NaN))))

new_data$initial_list_status_trnsfrmd <- new_data$initial_list_status

new_data$term_int_trnsfrmd <- ifelse(new_data$term_int == 36, 'term_36',
                                      ifelse(new_data$term_int == 60, 'term_60', NaN))

new_data$emp_length_int_trnsfrmd <- ifelse(new_data$emp_length_int %in% c(0), 'emp_length_0',
                                            ifelse(new_data$emp_length_int %in% c(1,2,3,4), 'emp_length_1_4',
                                                   ifelse(new_data$emp_length_int %in% c(5,6), 'emp_length_5_6',
                                                          ifelse(new_data$emp_length_int %in% c(7,8,9), 'emp_length_7_9',
                                                                 'emp_length_10'))))

new_data$mths_since_issue_d_trnsfrmd <- ifelse(new_data$mths_since_issue_d <= 55, paste0('mths_since_issue_date=',str(new_data$mths_since_issue_d)),
                                                ifelse(new_data$mths_since_issue_d %in% c(56,57,58,59,60,61,62,63), 'mths_since_issue_date>=56<=63',
                                                       ifelse(new_data$mths_since_issue_d %in% seq(64,84), 'mths_since_issue_date>63<=84', 
                                                              ifelse(new_data$mths_since_issue_d > 84, 'mths_since_issue_data>84', NaN))))
##  num [1:759338] 5 5 5 5 5 5 5 5 5 5 ...
new_data$int_rate_factor_trnsfrmd <- ifelse(new_data$int_rate <= 9.548, 'int_rate<9.548',
                                             ifelse(new_data$int_rate <= 12.025, '9.548<int_rate<=12.025',
                                                    ifelse(new_data$int_rate <= 15.74, '12.025<int_rate<=15.74',
                                                           ifelse(new_data$int_rate <= 20.281, '15.74<int_rate<20.281',
                                                                  ifelse(new_data$int_rate >20.281, 'int_rate>20.281',NaN)))))

new_data$funded_amnt_factor_trnsfrmd <- ifelse(new_data$funded_amnt <= 1000, 'funded_amnt:<5K',
                                                ifelse(new_data$funded_amnt <= 10000, 'funded_amnt:5K-10K',
                                                       ifelse(new_data$funded_amnt <= 20000, 'funded_amnt:10K:20K',
                                                              ifelse(new_data$funded_amnt <= 30000, 'funded_amnt:20K:30:',
                                                                     ifelse(new_data$funded_amnt <= 50000, 'funded_amnt:30K:50K',
                                                                            ifelse(new_data$funded_amnt <= 70000, 'funded_amnt:50K:70K',
                                                                                   ifelse(new_data$funded_amnt <= 100000, 'funded_amnt:70K:100K',
                                                                                          ifelse(new_data$funded_amnt > 100000, 'funded_amnt:>100K',
                                                                                                 NaN))))))))

new_data$mths_since_earliest_cr_line_factor_trnsfrmd <- ifelse(new_data$mths_since_earliest_cr_line <= 140, 'mths_since_earliest_cr_line<=140',
                                                                ifelse(new_data$mths_since_earliest_cr_line <= 246, '140<mths_since_earliest_cr_line<=246',
                                                                       ifelse(new_data$mths_since_earliest_cr_line <= 270, '246<mths_since_earliest_cr_line<=270',
                                                                              ifelse(new_data$mths_since_earliest_cr_line <= 293, '270<mths_since_earliest_cr_line<=293',
                                                                                     ifelse(new_data$mths_since_earliest_cr_line <= 398, '293<mths_since_earliest_cr_line<=398',
                                                                                            ifelse(new_data$mths_since_earliest_cr_line >398, 'mths_since_earliest_cr_line>398',
                                                                                                   NaN))))))

new_data$delinq_2yrs_trnsfrmd <- ifelse(new_data$delinq_2yrs == 0, 'delinq_2yrs=0',
                                         ifelse(new_data$delinq_2yrs == 1, 'delinq_2yrs=1',
                                                ifelse(new_data$delinq_2yrs == 2, 'delinq_2yrs=2',
                                                       ifelse(new_data$delinq_2yrs > 2, 'delinq_2yrs>2',
                                                              NaN))))

new_data$inq_last_6mths_trnsfrmd <- ifelse(new_data$inq_last_6mths == 0, 'inq_last_6mths=0',
                                            ifelse(new_data$inq_last_6mths == 1, 'inq_last_6mths=1',
                                                   ifelse(new_data$inq_last_6mths == 2, 'inq_last_6mths=2',
                                                          ifelse(new_data$inq_last_6mths == 3, 'inq_last_6mths=3',
                                                                 ifelse(new_data$inq_last_6mths > 3, 'inq_last_6mths>3',
                                                                        NaN)))))

new_data$open_acc_trnsfrmd <- ifelse(new_data$open_acc %in% c(0,1,2,3,seq(21,max(new_data$open_acc))), 'open_acc=0,1,2,3,21-max',
                                      ifelse(new_data$open_acc %in% c(4,5,6,7), 'open_acc=4,5,6,7',
                                             ifelse(new_data$open_acc %in% c(8), 'open_acc=8',
                                                    ifelse(new_data$open_acc %in% c(9,10,11,12), 'open_acc=9,10,11,12',
                                                           ifelse(new_data$open_acc %in% c(13,14,15,16), 'open_acc=13,14,15,16',
                                                                  ifelse(new_data$open_acc %in% c(17), 'open_acc=17',
                                                                         ifelse(new_data$open_acc %in% c(18,19,20), 'open_acc=18,19,20',
                                                                                NaN)))))))

new_data$pub_rec_trnsfrmd <- ifelse(new_data$pub_rec == 0, 'pub_rec=0',
                                     ifelse(new_data$pub_rec == 1, 'pub_rec=1',
                                            ifelse(new_data$pub_rec == 2, 'pub_rec=2',
                                                   ifelse(new_data$pub_rec >2, 'pub_rec>=3',
                                                          NaN))))

new_data$total_acc_factor_trnsfrmd <- ifelse(new_data$total_acc <= 10, 'total_acc<=10',
                                              ifelse(new_data$total_acc <= 25, 'total_acc<=25',
                                                     ifelse(new_data$total_acc <= 50, 'total_acc<=50',
                                                            ifelse(new_data$total_acc > 50, 'total_acc>50',
                                                                   NaN))))

new_data$acc_now_delinq_trnsfrmd <- ifelse(new_data$acc_now_delinq == 0, 'acc_now_delinq=0',
                                            ifelse(new_data$acc_now_delinq >= 1, 'acc_now_delinq>=1',
                                                   NaN))

new_data$total_rev_hi_lim_factor_trnsfrmd <- ifelse(new_data$total_rev_hi_lim <= 5000, 'total_rev_hi_lim<=5K',
                                                     ifelse(new_data$total_rev_hi_lim <= 10000, 'total_rev_hi_lim:5K-10K',
                                                            ifelse(new_data$total_rev_hi_lim <= 20000, 'total_rev_hi_lim:10K-20K',
                                                                   ifelse(new_data$total_rev_hi_lim <= 30000, 'total_rev_hi_lim:20K-30K',
                                                                          ifelse(new_data$total_rev_hi_lim <= 40000, 'total_rev_hi_lim:30K-40K',
                                                                                 ifelse(new_data$total_rev_hi_lim <= 50000, 'total_rev_hi_lim:40K-50K',
                                                                                        ifelse(new_data$total_rev_hi_lim <= 100000, 'total_rev_hi_lim:50K-100L',
                                                                                               ifelse(new_data$total_rev_hi_lim > 100000, 'total_rev_hi_lim>100K',
                                                                                                      NaN))))))))


new_data$installment_factor_trnsfrmd <- ifelse(new_data$installment <= 30, 'installment<=30',
                                                ifelse(new_data$installment <= 100, 'installment:30-100',
                                                       ifelse(new_data$installment <= 200, 'installment:100-200',
                                                              ifelse(new_data$installment <= 300, 'installment:200-300',
                                                                     ifelse(new_data$installment <= 400, 'installment:300-400',
                                                                            ifelse(new_data$installment <= 500, 'installment:400-500',
                                                                                   ifelse(new_data$installment <= 600, 'installment:500:600',
                                                                                          ifelse(new_data$installment <= 700, 'installment:600-700',
                                                                                                 ifelse(new_data$installment <= 800, 'installment:700-800',
                                                                                                        ifelse(new_data$installment > 800, 'installment>800',
                                                                                                               NaN))))))))))

new_data$annual_inc_factor_trnsfrmd <- ifelse(new_data$annual_inc <= 20000, 'annual_inc<=20K',
                                               ifelse(new_data$annual_inc <= 30000, 'annual_inc:20K-30K',
                                                      ifelse(new_data$annual_inc <= 40000, 'annual_inc:30K-40K',
                                                             ifelse(new_data$annual_inc <= 50000, 'annual_inc:40K-50K',
                                                                    ifelse(new_data$annual_inc <= 60000, 'annual_inc:50K-60K',
                                                                           ifelse(new_data$annual_inc <= 70000, 'annual_inc:60K-70K',
                                                                                  ifelse(new_data$annual_inc <= 80000, 'annual_inc:70K-80K',
                                                                                         ifelse(new_data$annual_inc <= 90000, 'annual_inc:80K-90K',
                                                                                                ifelse(new_data$annual_inc <= 100000, 'annual_inc:90K-100K',
                                                                                                       ifelse(new_data$annual_inc <= 120000, 'annual_inc:100K-120K',
                                                                                                              ifelse(new_data$annual_inc <= 140000, 'annual_inc:120K-140K',
                                                                                                                     ifelse(new_data$annual_inc > 140000, 'annual_inc:>140K',
                                                                                                                            NaN))))))))))))
new_data$mths_since_last_delinq_factor_trnsfrmd <- ifelse(new_data$mths_since_last_delinq <= 3, 'mths_since_last_delinq<=3',
                                                           ifelse(new_data$mths_since_last_delinq <= 7, 'mths_since_last_delinq:4-7',
                                                                  ifelse(new_data$mths_since_last_delinq <= 40, 'mths_since_last_delinq:7-40',
                                                                         ifelse(new_data$mths_since_last_delinq <= 80, 'mths_since_last_delinq:40-80',
                                                                                ifelse(new_data$mths_since_last_delinq > 80, 'mths_since_last_delinq>80',
                                                                                       ifelse(is.na(new_data$mths_since_last_delinq), 'Missing',
                                                                                              NaN))))))
new_data$dti_factor_trnsfrmd <- ifelse(new_data$dti <= 1, 'dti:<1',
                                        ifelse(new_data$dti <= 2, 'dti:1-2',
                                               ifelse(new_data$dti <= 4, 'dti:2-4',
                                                      ifelse(new_data$dti <= 6, 'dti:4-6',
                                                             ifelse(new_data$dti <= 8, 'dti:6-8',
                                                                    ifelse(new_data$dti <= 10, 'dti:8-10',
                                                                           ifelse(new_data$dti <= 15, 'dti:10-15',
                                                                                  ifelse(new_data$dti <= 20, 'dti:15-20',
                                                                                         ifelse(new_data$dti <= 25, 'dti:20-25',
                                                                                                ifelse(new_data$dti <= 30, 'dti:25-30',
                                                                                                       ifelse(new_data$dti <= 35, 'dti:30-35',
                                                                                                              ifelse(new_data$dti >35, 'dti:>35',
                                                                                                                     NaN))))))))))))

new_data$mths_since_last_record_factor_trnsfrmd <- ifelse(new_data$mths_since_last_record <= 2, 'mths_since_last_record<=2',
                                                           ifelse(new_data$mths_since_last_record <= 30, 'mths_since_last_record:3-30',
                                                                  ifelse(new_data$mths_since_last_record <= 65, 'mths_since_last_record:31-65',
                                                                         ifelse(new_data$mths_since_last_record <= 80, 'mths_since_last_record:66-80',
                                                                                ifelse(new_data$mths_since_last_record > 80, 'mths_since_last_record>80',
                                                                                       ifelse(is.na(new_data$mths_since_last_record), 'Missing',
                                                                                              NaN))))))

new_data <- new_data %>%
  mutate(mths_since_last_record_factor_trnsfrmd = replace_na(mths_since_last_record_factor_trnsfrmd, 'Missing'))
new_data <- new_data %>%
  mutate(mths_since_last_delinq_factor_trnsfrmd = replace_na(mths_since_last_delinq_factor_trnsfrmd, 'Missing'))

new_data_trnsfrmd <- new_data %>%
  select(ends_with(c('good_bad', 'trnsfrmd')))

Calculate frequency of categories

Looking at the AUROC, we see that the model predicts probability of default and credit score to an acceptable level. However, as new data arrives, our current model may grow out of date: it has not been trained on the new data, which might be substantially different from the old data. To assess whether the model needs updating, we quantify how different the new data is from the data the model was trained on. We do this via the Population Stability Index (PSI), which tells us how much the distribution of each feature differs between the new and old data. To calculate PSI, we first compute the proportion of each category within each feature, for both the new and the old data,

# categorizing credit scores

new_data_trnsfrmd <- new_data_trnsfrmd %>%
  mutate(credit_score = predict(model1, new_data_trnsfrmd))

new_data_trnsfrmd <- new_data_trnsfrmd %>%
  select(-contains(pull(IV_df %>% filter(IV_value < 0.01) %>% select(WoE_col))))


train_data_trnsfrmd$score_cat <- cut(train_data_trnsfrmd$credit_score, seq(300,850,50))
new_data_trnsfrmd$score_cat <- cut(new_data_trnsfrmd$credit_score, seq(300,850,50))

# calculate the frequency of each category in each feature of the train data
num_features <- length(colnames(train_data_trnsfrmd)) - 2
col_names <- colnames(train_data_trnsfrmd)
train_categories_freq <- data.frame()
# colnames(train_categories_freq) <- c('categories', 'freq', 'column_name')
for (col_index in c(2:(num_features), num_features + 2)){
  test <- train_data_trnsfrmd[, ..col_index] %>%
    group_by_all() %>%
    summarise(freq_train = n()) %>%
    mutate(column_name = col_names[col_index]) %>%
    rename(categories = 1)
  train_categories_freq <- train_categories_freq %>%
    bind_rows(test)
}

# calculate the frequency of each category in each feature of the new data
num_features <- length(colnames(new_data_trnsfrmd)) - 2
col_names <- colnames(new_data_trnsfrmd)
new_categories_freq <- data.frame()
for (col_index in c(2:(num_features), num_features + 2)){
  test <- new_data_trnsfrmd[, ..col_index] %>%
    group_by_all() %>%
    summarise(freq_new = n()) %>%
    mutate(column_name = col_names[col_index]) %>%
    rename(categories = 1)
  new_categories_freq <- new_categories_freq %>%
    bind_rows(test)
}
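
The two loops above differ only in the input data and the name of the frequency column; a small helper along these lines (an illustrative refactor relying on tidyr, which ships with the tidyverse) would avoid the duplication:

category_freq <- function(df, freq_name) {
  df %>%
    select(-good_bad, -credit_score) %>%
    pivot_longer(everything(), names_to = 'column_name', values_to = 'categories',
                 values_transform = list(categories = as.character)) %>%
    count(column_name, categories, name = freq_name)
}

# e.g. train_categories_freq <- category_freq(train_data_trnsfrmd, 'freq_train')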

PSI_calc <- train_categories_freq %>%
  left_join(new_categories_freq) %>%
  mutate(freq_train = freq_train / nrow(train_data_trnsfrmd),
         freq_new = freq_new / nrow(new_data_trnsfrmd)) %>%
  ungroup() %>%
  mutate(freq_train = ifelse(is.na(freq_train), 0, freq_train),
         freq_new = ifelse(is.na(freq_new), 0, freq_new))

Population stability index

Finally we can join the category frequencies of the new and old data and calculate the PSI per feature. For a given feature, PSI = sum over its categories of (p_new - p_old) * ln(p_new / p_old), where p_new and p_old are the proportions of a category in the new and old data; the per-category terms are the contribution column computed below, and summing them per feature gives the PSI,

freq_table <- PSI_calc %>%
  mutate(contribution = ifelse(freq_new == 0 | freq_train == 0, 0,
                               (freq_new - freq_train) * log(freq_new / freq_train)))

PSI_calc <- freq_table %>%
  group_by(column_name) %>%
  summarise(contribution = sum(contribution)) %>%
  mutate(column_name = str_replace(column_name, "_trnsfrmd", ""))

colnames(PSI_calc) <- c('Feature', 'PSI Contribution')

kable(PSI_calc, caption='Population Stability Index Table')
Population Stability Index Table
Feature PSI Contribution
addr_state 0.0084278
annual_inc_factor 0.0102393
dti_factor 0.0049768
funded_amnt_factor 0.0158914
grade 0.0256307
home_ownership 0.0042154
initial_list_status 0.3472448
inq_last_6mths 0.0286351
installment_factor 0.0209305
int_rate_factor 0.0557600
mths_since_earliest_cr_line_factor 0.0859222
mths_since_issue_d 0.0192241
mths_since_last_record_factor 0.0123267
purpose 0.0089714
score_cat 0.1488427
term_int 0.0076333
total_acc_factor 0.0105239
total_rev_hi_lim_factor 0.0244212
verification_status 0.0115202

The rule of thumb for interpreting PSI is:

PSI < 0.1: no significant change; you can continue using the existing model.
0.1 <= PSI < 0.2: a slight change to the model is required.
PSI >= 0.2: a significant change is required; ideally, you should not use this model any more.
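
These bands can also be applied programmatically to the PSI table above (an illustrative convenience, not part of the original code):

PSI_calc %>%
  mutate(conclusion = case_when(
    `PSI Contribution` < 0.1 ~ 'no change needed',
    `PSI Contribution` < 0.2 ~ 'slight change needed',
    TRUE                     ~ 'significant change needed'
  )) %>%
  arrange(desc(`PSI Contribution`))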

Based on this table, we conclude that initial_list_status_trnsfrmd, with a PSI contribution of 0.3472448, has changed substantially. Let us therefore examine it more closely to see what exactly has changed.

kable(freq_table %>% filter(column_name == 'initial_list_status_trnsfrmd'), caption='initial_list_status: category proportions and PSI contributions')
initial_list_status: category proportions and PSI contributions
categories freq_train column_name freq_new contribution
f 0.5150067 initial_list_status_trnsfrmd 0.2350205 0.2196512
w 0.4849933 initial_list_status_trnsfrmd 0.7649795 0.1275936

Notice that the proportions of w and f are quite different between the old and new data, but this reflects a change in the bank's strategy rather than a change in the population. Hence, it is not a reason to change the model.

The second variable with a relatively high PSI contribution is score_cat, the categorised credit score, with a value of 0.1488427. This indicates a slight shift, meaning the model needs a minor update before being applied to the new data.

LGD Modelling

Preparing data and modelling LGD

To calculate the expected loss of a loan portfolio we need three quantities: 1) Probability of Default (PD), 2) Loss Given Default (LGD) and 3) Exposure at Default (EAD). We have already calculated PD, so we now turn to the other two, LGD and EAD.

In order to calculate LGD and EAD, we first filter on loan_status 'Charged Off' and 'Does not meet the credit policy. Status:Charged Off', as these are the loans that have had enough time for recoveries to be completed. We then calculate the recovery rate and the Credit Conversion Factor (CCF) for these loans, split the data into train and test sets, and fit the models. Since the recovery rate (and hence LGD, which is 1 minus the recovery rate) can take any value between 0 and 1, it is generally modelled with Beta regression rather than the logistic regression used for PD.
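
For reference, once PD, LGD and EAD are all available, the expected loss of an exposure is simply their product, EL = PD × LGD × EAD, with EAD expressed as a monetary amount.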

library(betareg)
library(MLmetrics)
# concatenate train and test data to get full default data
full_data <- train_data %>%
  bind_rows(test_data)

# calculate recovery rate and ccf
default_data <- full_data %>%
  filter(loan_status %in% c('Charged Off','Does not meet the credit policy. Status:Charged Off')) %>%
  mutate(mths_since_last_delinq = replace_na(mths_since_last_delinq, 0),
         mths_since_last_record = replace_na(mths_since_last_record, 0),
         recovery_rate = recoveries / funded_amnt) %>%
  mutate(recovery_rate = ifelse(recovery_rate <= 0, 0.001,
                                ifelse(recovery_rate >= 1, 0.999,
                                       recovery_rate))) %>%
  mutate(ccf = (funded_amnt - total_rec_prncp) / funded_amnt)

# divide full default data into training and testing datasets.
train_index <- createDataPartition(default_data$recovery_rate,p=0.8,list=FALSE)
train_default <- default_data[train_index,]
test_default <- default_data[-train_index,]

# list of relevant fields and keep only these columns
default_cols <- c('grade',
                  'home_ownership',
                  'verification_status',
                  'purpose',
                  'initial_list_status',
                  'term_int',
                  'emp_length_int',
                  'mths_since_issue_d',
                  'mths_since_earliest_cr_line',
                  'funded_amnt',
                  'int_rate',
                  'installment',
                  'annual_inc',
                  'dti',
                  'delinq_2yrs',
                  'inq_last_6mths',
                  'mths_since_last_delinq',
                  'mths_since_last_record',
                  'open_acc',
                  'pub_rec',
                  'total_acc',
                  'acc_now_delinq',
                  'total_rev_hi_lim',
                  'recovery_rate')

# convert character columns into factors.
lgd_input_train <- train_default %>%
  select(all_of(default_cols)) %>%
  mutate_if(is.character,as.factor)

lgd_input_test <- test_default %>%
  select(all_of(default_cols)) %>%
  mutate_if(is.character,as.factor)

# Beta regression
lgd_model <- betareg(recovery_rate ~., data=lgd_input_train)
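
Since LGD is the complement of the recovery rate, predicted LGD values can be derived directly from the fitted Beta regression; a minimal sketch with illustrative object names:

predicted_recovery_test <- predict(lgd_model, lgd_input_test, type = 'response')
predicted_lgd_test <- 1 - predicted_recovery_test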

Checking model for over/underfitting

After fitting the Beta regression, we predict the recovery rate on both the training and the testing data, using the fitted coefficients. The reason we also predict on the training set is to check whether the model over- or underfits: we calculate RMSE and MSE between predictions and actual values for both sets, and if these quantities are close for training and testing, the model does not suffer from over/underfitting.

predict_test <- predict(lgd_model, lgd_input_test, type = 'response')
predict_train <- predict(lgd_model, lgd_input_train, type = 'response')

print(paste0("MSE for testing dataset is: ", MSE(predict_test, lgd_input_test$recovery_rate)))
## [1] "MSE for testing dataset is: 0.00782598568596409"
print(paste0("RMSE for testing dataset is: ", RMSE(predict_test, lgd_input_test$recovery_rate)))
## [1] "RMSE for testing dataset is: 0.0884646013158037"
print(paste0("MSE for training dataset is: ", MSE(predict_train, lgd_input_train$recovery_rate)))
## [1] "MSE for training dataset is: 0.00773418256727992"
print(paste0("RMSE for training dataset is: ", RMSE(predict_train, lgd_input_train$recovery_rate)))
## [1] "RMSE for training dataset is: 0.0879442014420503"

As the MSE and RMSE for the testing and training sets are very close to each other, the model shows no sign of over- or underfitting.

EAD modelling

Preparing data and modelling EAD

The majority of steps for modelling EAD are the same as for LGD, with two key differences: 1) the target variable is ccf instead of recovery_rate, and 2) EAD is modelled with a generalised linear model (Gaussian family) instead of the Beta regression used for LGD.

default_cols <- c('grade',
                  'home_ownership',
                  'verification_status',
                  'purpose',
                  'initial_list_status',
                  'term_int',
                  'emp_length_int',
                  'mths_since_issue_d',
                  'mths_since_earliest_cr_line',
                  'funded_amnt',
                  'int_rate',
                  'installment',
                  'annual_inc',
                  'dti',
                  'delinq_2yrs',
                  'inq_last_6mths',
                  'mths_since_last_delinq',
                  'mths_since_last_record',
                  'open_acc',
                  'pub_rec',
                  'total_acc',
                  'acc_now_delinq',
                  'total_rev_hi_lim',
                  'ccf')
ead_input_train <- train_default %>%
  select(all_of(default_cols)) %>%
  mutate_if(is.character,as.factor)

ead_input_test <- test_default %>%
  select(all_of(default_cols)) %>%
  mutate_if(is.character,as.factor)

sapply(ead_input_train, function(y) sum(length(which(is.na(y)))))
##                       grade              home_ownership 
##                           0                           0 
##         verification_status                     purpose 
##                           0                           0 
##         initial_list_status                    term_int 
##                           0                           0 
##              emp_length_int          mths_since_issue_d 
##                           0                           0 
## mths_since_earliest_cr_line                 funded_amnt 
##                           0                           0 
##                    int_rate                 installment 
##                           0                           0 
##                  annual_inc                         dti 
##                           0                           0 
##                 delinq_2yrs              inq_last_6mths 
##                           0                           0 
##      mths_since_last_delinq      mths_since_last_record 
##                           0                           0 
##                    open_acc                     pub_rec 
##                           0                           0 
##                   total_acc              acc_now_delinq 
##                           0                           0 
##            total_rev_hi_lim                         ccf 
##                           0                           0
# ead_model <- glm(ccf ~., family = binomial(link='logit'), data=ead_input_train)
ead_model <- glm(ccf ~., data=ead_input_train)
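
Given the CCF defined above (the share of the funded amount not yet repaid at default), predicted EAD is obtained by scaling the funded amount with the predicted CCF; a minimal sketch with illustrative names:

predicted_ccf_test <- predict(ead_model, ead_input_test, type = 'response')
predicted_ead_test <- predicted_ccf_test * ead_input_test$funded_amnt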

Checking model for over/underfitting

Again, we check the model for over/underfitting.

predict_test <- predict(ead_model, ead_input_test, type = 'response')
predict_train <- predict(ead_model, ead_input_train, type = 'response')

print(paste0("MSE for testing dataset is: ", MSE(predict_test, ead_input_test$ccf)))
## [1] "MSE for testing dataset is: 0.0274414936721869"
print(paste0("RMSE for testing dataset is: ", RMSE(predict_test, ead_input_test$ccf)))
## [1] "RMSE for testing dataset is: 0.165654742377593"
print(paste0("MSE for training dataset is: ", MSE(predict_train, ead_input_train$ccf)))
## [1] "MSE for training dataset is: 0.0283002547548188"
print(paste0("RMSE for training dataset is: ", RMSE(predict_train, ead_input_train$ccf)))
## [1] "RMSE for training dataset is: 0.168226795591008"