Machine Learning: AllLife Bank Personal Loan Campaign¶
Problem Statement¶
Context¶
AllLife Bank is a US bank that has a growing customer base. The majority of these customers are liability customers (depositors) with varying sizes of deposits. The number of customers who are also borrowers (asset customers) is quite small, and the bank is interested in expanding this base rapidly to bring in more loan business and in the process, earn more through the interest on loans. In particular, the management wants to explore ways of converting its liability customers to personal loan customers (while retaining them as depositors).
A campaign that the bank ran last year for liability customers showed a healthy conversion rate of over 9%. This has encouraged the retail marketing department to devise campaigns with better target marketing to increase the success ratio.
As a data scientist at AllLife Bank, you have to build a model that will help the marketing department identify the potential customers who have a higher probability of purchasing the loan.
Objective¶
To predict whether a liability customer will buy a personal loan, to understand which customer attributes are most significant in driving purchases, and to identify which segments of customers to target.
Data Dictionary¶
- ID: Customer ID
- Age: Customer's age in completed years
- Experience: Years of professional experience
- Income: Annual income of the customer (in thousand dollars)
- ZIPCode: Home address ZIP code
- Family: Family size of the customer
- CCAvg: Average spending on credit cards per month (in thousand dollars)
- Education: Education level (1: Undergrad; 2: Graduate; 3: Advanced/Professional)
- Mortgage: Value of house mortgage, if any (in thousand dollars)
- Personal_Loan: Did this customer accept the personal loan offered in the last campaign? (0: No, 1: Yes)
- Securities_Account: Does the customer have a securities account with the bank? (0: No, 1: Yes)
- CD_Account: Does the customer have a certificate of deposit (CD) account with the bank? (0: No, 1: Yes)
- Online: Does the customer use internet banking facilities? (0: No, 1: Yes)
- CreditCard: Does the customer use a credit card issued by any other bank (excluding AllLife Bank)? (0: No, 1: Yes)
Importing necessary libraries¶
# Installing the libraries with the specified version.
#!pip install numpy==1.25.2 pandas==1.5.3 matplotlib==3.7.1 seaborn==0.13.1 scikit-learn==1.2.2 sklearn-pandas==2.2.0 -q --user
Note: After running the above cell, kindly restart the notebook kernel and run all cells sequentially from the start again.
# to load and manipulate data
import pandas as pd
import numpy as np
# to visualize data
import matplotlib.pyplot as plt
import seaborn as sns
# Removes the limit for the number of displayed columns
pd.set_option("display.max_columns", None)
# Sets the limit for the number of displayed rows
pd.set_option("display.max_rows", 200)
# to split data into training and test sets
from sklearn.model_selection import train_test_split
# to build decision tree model
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import f1_score, make_scorer
# to tune different models
from sklearn.model_selection import GridSearchCV
# to compute classification metrics
from sklearn.metrics import (
confusion_matrix,
accuracy_score,
recall_score,
precision_score,
f1_score,
)
# To ignore unnecessary warnings
import warnings
warnings.filterwarnings("ignore")
Loading the dataset¶
# run the following lines if using Google Colab
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
# loading data into a pandas dataframe
loan = pd.read_csv("/content/drive/MyDrive/Personal/UT Austin/Project 2 - Machine Learning/Loan_Modelling.csv")
# creating a copy of the data
data = loan.copy()
# Identify which columns are numeric features and which are categorical features
# Exclude ZIPCode
num_features = ['Age','Experience','Income','CCAvg','Mortgage','Family']
cat_features = ['Securities_Account','CD_Account','Online','CreditCard','Education']
# Show how many unique values ZIPCode has
data['ZIPCode'].nunique()
467
Given the large number of ZIPCode values, this attribute is not suitable for one-hot encoding. If ZIPCode is to be used at all, it would need to be upleveled, for example by converting it to a city or state, or by using just the first two digits of the ZIP code.
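As a quick illustration of that idea, here is a minimal sketch deriving a coarse region from the first two ZIP digits; the Region name and the two-digit cutoff are assumptions for illustration only, not part of the analysis that follows.
# sketch: a hypothetical coarse "Region" feature from the first two ZIP digits
region = data['ZIPCode'].astype(str).str[:2]
print(region.nunique(), "two-digit regions vs", data['ZIPCode'].nunique(), "raw ZIP codes")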
Data Overview¶
Viewing the first and last 5 rows of the dataset¶
data.head(5)
| | Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 25 | 1 | 49 | 4 | 1.6 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 1 | 45 | 19 | 34 | 3 | 1.5 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 39 | 15 | 11 | 1 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 35 | 9 | 100 | 1 | 2.7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 35 | 8 | 45 | 4 | 1.0 | 2 | 0 | 0 | 0 | 0 | 0 | 1 |
data.tail(5)
| | Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4995 | 29 | 3 | 40 | 1 | 1.9 | 3 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4996 | 30 | 4 | 15 | 4 | 0.4 | 1 | 85 | 0 | 0 | 0 | 1 | 0 |
| 4997 | 63 | 39 | 24 | 2 | 0.3 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4998 | 65 | 40 | 49 | 3 | 0.5 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4999 | 28 | 4 | 83 | 3 | 0.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
Checking the shape of the dataset¶
data.shape
(5000, 12)
Checking the attribute types¶
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   Age                 5000 non-null   int64
 1   Experience          5000 non-null   int64
 2   Income              5000 non-null   int64
 3   Family              5000 non-null   int64
 4   CCAvg               5000 non-null   float64
 5   Education           5000 non-null   int64
 6   Mortgage            5000 non-null   int64
 7   Personal_Loan       5000 non-null   int64
 8   Securities_Account  5000 non-null   int64
 9   CD_Account          5000 non-null   int64
 10  Online              5000 non-null   int64
 11  CreditCard          5000 non-null   int64
dtypes: float64(1), int64(11)
memory usage: 468.9 KB
Checking the statistical summary¶
data.describe(include="all")
| | Age | Experience | Income | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.000000 | 5000.00000 | 5000.000000 | 5000.000000 |
| mean | 45.338400 | 20.104600 | 73.774200 | 2.396400 | 1.937938 | 1.881000 | 56.498800 | 0.096000 | 0.104400 | 0.06040 | 0.596800 | 0.294000 |
| std | 11.463166 | 11.467954 | 46.033729 | 1.147663 | 1.747659 | 0.839869 | 101.713802 | 0.294621 | 0.305809 | 0.23825 | 0.490589 | 0.455637 |
| min | 23.000000 | -3.000000 | 8.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
| 25% | 35.000000 | 10.000000 | 39.000000 | 1.000000 | 0.700000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 |
| 50% | 45.000000 | 20.000000 | 64.000000 | 2.000000 | 1.500000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 | 0.000000 |
| 75% | 55.000000 | 30.000000 | 98.000000 | 3.000000 | 2.500000 | 3.000000 | 101.000000 | 0.000000 | 0.000000 | 0.00000 | 1.000000 | 1.000000 |
| max | 67.000000 | 43.000000 | 224.000000 | 4.000000 | 10.000000 | 3.000000 | 635.000000 | 1.000000 | 1.000000 | 1.00000 | 1.000000 | 1.000000 |
- The average of 0.096 for Personal_Loan means 9.6% of customers accepted the loan offer.
- The average age of the customers is ~45 years, with an average income of ~$74K.
- Customers spend on average ~$2K per month on their credit cards.
- The average family size of the customers is 2.4 people.
- The minimum Experience of -3 years is not physically meaningful and suggests data-entry errors (checked below).
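Since the summary above shows a minimum Experience of -3 years, here is a minimal check of how widespread the negative values are; we only inspect them, and no treatment is applied elsewhere in this notebook.
# counting and inspecting the rows with negative professional experience
neg_exp = data[data['Experience'] < 0]
print(f"Rows with negative Experience: {neg_exp.shape[0]}")
print(neg_exp['Experience'].value_counts())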
Checking for missing values¶
# checking for null values
data.isnull().sum()
| | Missing Values |
|---|---|
| Age | 0 |
| Experience | 0 |
| Income | 0 |
| Family | 0 |
| CCAvg | 0 |
| Education | 0 |
| Mortgage | 0 |
| Personal_Loan | 0 |
| Securities_Account | 0 |
| CD_Account | 0 |
| Online | 0 |
| CreditCard | 0 |
- There are no missing values in the dataset.
Checking for duplicate values¶
# checking for duplicate values
data.duplicated().sum()
13
- There are 13 duplicate rows in the dataset. Since these 12 attributes carry no unique identifier, the duplicates may well be distinct customers who happen to share the same values, so we retain them (inspected below).
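A minimal sketch to inspect those duplicates; keep=False flags every member of each duplicate group, and the commented line shows how we would drop them if we chose to.
# inspecting all rows that belong to a duplicate group
dupes = data[data.duplicated(keep=False)]
print(dupes.shape)
# data = data.drop_duplicates()  # would keep the first occurrence of each group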
Exploratory Data Analysis¶
- EDA is an important part of any project involving data.
- It is important to investigate and understand the data better before building a model with it.
- A few questions have been mentioned below which will help you approach the analysis in the right manner and generate insights from the data.
- A thorough analysis of the data, in addition to the questions mentioned below, should be done.
Questions:
- What is the distribution of the mortgage attribute? Are there any noticeable patterns or outliers in the distribution?
- How many customers have credit cards?
- What are the attributes that have a strong correlation with the target attribute (personal loan)?
- How does a customer's interest in purchasing a loan vary with their age?
- How does a customer's interest in purchasing a loan vary with their education?
Some plotting functions we use¶
def histogram_boxplot(data, feature, figsize=(15, 10), kde=False, bins=None):
"""
Boxplot and histogram combined
data: dataframe
feature: dataframe column
figsize: size of figure (default (15,10))
kde: whether to show the density curve (default False)
bins: number of bins for histogram (default None)
"""
f2, (ax_box2, ax_hist2) = plt.subplots(
nrows=2, # Number of rows of the subplot grid= 2
sharex=True, # x-axis will be shared among all subplots
gridspec_kw={"height_ratios": (0.25, 0.75)},
figsize=figsize,
) # creating the 2 subplots
sns.boxplot(
data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
) # boxplot will be created and a triangle will indicate the mean value of the column
sns.histplot(
data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins
) if bins else sns.histplot(
data=data, x=feature, kde=kde, ax=ax_hist2
) # For histogram
ax_hist2.axvline(
data[feature].mean(), color="green", linestyle="--"
) # Add mean to the histogram
ax_hist2.axvline(
data[feature].median(), color="black", linestyle="-"
) # Add median to the histogram
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
"""
Barplot with percentage at the top
data: dataframe
feature: dataframe column
perc: whether to display percentages instead of count (default is False)
n: displays the top n category levels (default is None, i.e., display all levels)
"""
total = len(data[feature]) # length of the column
count = data[feature].nunique()
if n is None:
plt.figure(figsize=(count + 2, 6))
else:
plt.figure(figsize=(n + 2, 6))
plt.xticks(rotation=90, fontsize=15)
ax = sns.countplot(
data=data,
x=feature,
palette="Paired",
order=data[feature].value_counts().index[:n],
)
for p in ax.patches:
if perc == True:
label = "{:.1f}%".format(
100 * p.get_height() / total
) # percentage of each class of the category
else:
label = p.get_height() # count of each level of the category
x = p.get_x() + p.get_width() / 2 # width of the plot
y = p.get_height() # height of the plot
ax.annotate(
label,
(x, y),
ha="center",
va="center",
size=12,
xytext=(0, 5),
textcoords="offset points",
) # annotate the percentage
plt.show() # show the plot
def stacked_barplot(data, predictor, target):
"""
Print the category counts and plot a stacked bar chart
data: dataframe
predictor: independent variable
target: target variable
"""
count = data[predictor].nunique()
sorter = data[target].value_counts().index[-1]
tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
by=sorter, ascending=False
)
print(tab1)
print("-" * 120)
tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
by=sorter, ascending=False
)
tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))
plt.legend(loc="upper left", bbox_to_anchor=(1, 1)) # place the legend outside the plot
plt.show()
### function to plot distributions wrt target
def distribution_plot_wrt_target(data, predictor, target):
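"""
Plot the distribution of a predictor for each target class, plus boxplots w.r.t. the target
data: dataframe
predictor: independent variable
target: target variable
"""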
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
target_uniq = data[target].unique()
axs[0, 0].set_title("Distribution of target for target=" + str(target_uniq[0]))
sns.histplot(
data=data[data[target] == target_uniq[0]],
x=predictor,
kde=True,
ax=axs[0, 0],
color="teal",
stat="density",
)
axs[0, 1].set_title("Distribution of target for target=" + str(target_uniq[1]))
sns.histplot(
data=data[data[target] == target_uniq[1]],
x=predictor,
kde=True,
ax=axs[0, 1],
color="orange",
stat="density",
)
axs[1, 0].set_title("Boxplot w.r.t target")
sns.boxplot(data=data, x=target, y=predictor, ax=axs[1, 0], palette="gist_rainbow")
axs[1, 1].set_title("Boxplot (without outliers) w.r.t target")
sns.boxplot(
data=data,
x=target,
y=predictor,
ax=axs[1, 1],
showfliers=False,
palette="gist_rainbow",
)
plt.tight_layout()
plt.show()
Question 1. Distribution of mortgage attribute.¶
# Histogram for Mortgage
sns.histplot(data=data, x='Mortgage', kde=True);
We can see that many customers don't have any mortgage at all, i.e., those where the value is 0.
# count of rows where Mortgage is 0
zero_count = data[data['Mortgage'] == 0].shape[0]
# count of rows where Mortgage is not 0
nonzero_count = data[data['Mortgage'] != 0].shape[0]
total_rows = data.shape[0]
print(f"Number of customers without a mortgage: {zero_count} ({zero_count / total_rows * 100:.0f}%)")
print(f"Number of customers with a mortgage: {nonzero_count} ({nonzero_count / total_rows * 100:.0f}%)")
Number of customers without a mortgage: 3462 (69%)
Number of customers with a mortgage: 1538 (31%)
#Let's do a box plot of just those customers that have a mortgage
sns.boxplot(data=data[data['Mortgage'] != 0], x='Mortgage');
Here we can see a significant number of outliers for Mortgage (values above roughly $400K).
Question 2. Customers with credit cards from another bank.¶
# Checking how many customers have a credit card from another bank (1=Yes, 0=No)
print(data['CreditCard'].value_counts(), '\n')
print(100*data['CreditCard'].value_counts(normalize=True), '\n')
# plotting the count plot for CreditCard
sns.countplot(data=data, x='CreditCard');
CreditCard
0    3530
1    1470
Name: count, dtype: int64 

CreditCard
0    70.6
1    29.4
Name: proportion, dtype: float64
Approximately 70% of the customers do not have a credit card from another bank, while 30% do have one.
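The labeled_barplot helper defined above gives the same breakdown with the percentage annotated on each bar:
# equivalent view using the helper defined earlier in this notebook
labeled_barplot(data, 'CreditCard', perc=True)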
Question 3. What are the attributes that have a strong correlation with the target attribute (personal loan)?¶
# Scatter plot matrix of the numeric features, colored by the target
# (sns.pairplot creates its own figure, so no plt.figure call is needed)
sns.pairplot(data, vars=num_features, hue='Personal_Loan', diag_kind='kde');
From these scatter plots, it looks like Income and CCAvg have a strong relationship with the target attribute Personal_Loan. Mortgage appears to have some mild relationship at high Mortgage values.
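To put rough numbers behind that visual impression, a minimal sketch computing the Pearson correlation of each numeric feature with the binary target (equivalent to the point-biserial correlation for a 0/1 target):
# correlation of each numeric feature with the binary target
print(data[num_features].corrwith(data['Personal_Loan']).sort_values(ascending=False))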
# creating a crosstab for Personal_Loan vs Securities_Account
tab = pd.crosstab(
data['Securities_Account'],
data['Personal_Loan'],
normalize='index' # normalizing by dividing each row by its row total
).sort_values(by=0, ascending=False) # sorting the resulting crosstab
# Plot the stacked bar chart
tab.plot(kind='bar', stacked=True, figsize=(7, 5)) # creating a stacked bar chart from the normalized crosstab
plt.xlabel('Securities_Account')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Personal_Loan'); # adding a legend for the 'Personal_Loan' column
Having a Securities Account is associated with only a very slight increase in the likelihood of accepting the Personal Loan offer.
# creating a crosstab for Personal_Loan vs CD_Account
tab = pd.crosstab(
data['CD_Account'],
data['Personal_Loan'],
normalize='index' # normalizing by dividing each row by its row total
).sort_values(by=0, ascending=False) # sorting the resulting crosstab
# Plot the stacked bar chart
tab.plot(kind='bar', stacked=True, figsize=(7, 5)) # creating a stacked bar chart from the normalized crosstab
plt.xlabel('CD_Account')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Personal_Loan'); # adding a legend for the 'Personal_Loan' column
Having a CD Account appears to be a strong driver of the likelihood of accepting the Personal Loan offer.
# creating a crosstab for Personal_Loan vs Online
tab = pd.crosstab(
data['Online'],
data['Personal_Loan'],
normalize='index' # normalizing by dividing each row by its row total
).sort_values(by=0, ascending=False) # sorting the resulting crosstab
# Plot the stacked bar chart
tab.plot(kind='bar', stacked=True, figsize=(7, 5)) # creating a stacked bar chart from the normalized crosstab
plt.xlabel('Online')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Personal_Loan'); # adding a legend for the 'Personal_Loan' column
Using Online Banking services is associated with only a very slight increase in the likelihood of accepting the Personal Loan offer.
# creating a crosstab for Personal_Loan vs CreditCard
tab = pd.crosstab(
data['CreditCard'],
data['Personal_Loan'],
normalize='index' # normalizing by dividing each row by its row total
).sort_values(by=0, ascending=False) # sorting the resulting crosstab
# Plot the stacked bar chart
tab.plot(kind='bar', stacked=True, figsize=(7, 5)) # creating a stacked bar chart from the normalized crosstab
plt.xlabel('CreditCard')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Personal_Loan'); # adding a legend for the 'Personal_Loan' column
Having a credit card from another bank is associated with only a very slight increase in the likelihood of accepting the Personal Loan offer.
#Also let's look at the correlation of the numeric variables
# defining the size of the plot
plt.figure(figsize=(12, 7))
# plotting the heatmap for correlation
sns.heatmap(
data[num_features].corr(),annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral"
);
Age and Experience are highly correlated, while Income and CCAvg have a moderate amount of correlation.
Question 4. How does a customer's interest in purchasing a loan vary with their age?¶
# Let's look at the average loan purchase rate by age
# Plotting
plt.figure(figsize=(10, 6))
sns.lineplot(data=data, x='Age', y='Personal_Loan')
plt.title('Loan Purchase Rate by Age with Confidence Intervals')
plt.xlabel('Age')
plt.ylabel('Purchase Rate')
plt.show()
From the above chart we can see that age does not play a significant role in a customer's interest in purchasing a loan; the rate remains essentially flat once noise is allowed for.
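To smooth out the year-to-year noise, a minimal sketch that bins ages into decades and computes the purchase rate per band; the bin edges are an arbitrary choice for illustration:
# purchase rate by age band (bin edges chosen only for illustration)
age_bands = pd.cut(data['Age'], bins=[20, 30, 40, 50, 60, 70])
print(data.groupby(age_bands)['Personal_Loan'].mean())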
Question 5. How does a customer's interest in purchasing a loan vary with their education?¶
# creating a crosstab for Personal_Loan vs Education
tab = pd.crosstab(
data['Education'],
data['Personal_Loan'],
normalize='index' # normalizing by dividing each row by its row total
).sort_values(by=0, ascending=False) # sorting the resulting crosstab
# Plot the stacked bar chart
tab.plot(kind='bar', stacked=True, figsize=(7, 5)) # creating a stacked bar chart from the normalized crosstab
plt.xlabel('Education')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Personal_Loan'); # adding a legend for the 'Personal_Loan' column
From the above chart we can see that the percentage of customers purchasing a loan increases with their level of education.
It isn't clear if Zip Code will be helpful in this problem.
#Plot a frequency count for each value of Zip Code and color hue by Personal_Loan
plt.figure(figsize=(15, 6))
sns.countplot(data=data, x='ZIPCode', hue='Personal_Loan');
# As a further check, let's do a correlation analysis between Zip Code and Personal Loan
X = data[['ZIPCode']]
y = data['Personal_Loan']
# Calculate the Pearson's correlation coefficient
correlation = X.corrwith(y, method='pearson')
print("Correlation between Zip Code and Personal Loan:", correlation)
# Calculate the Spearman's rank correlation
correlation_spearman = X.corrwith(y, method='spearman')
print("Spearman's rank correlation between Zip Code and Personal Loan:", correlation_spearman)
Correlation between Zip Code and Personal Loan: ZIPCode   -0.002974
dtype: float64
Spearman's rank correlation between Zip Code and Personal Loan: ZIPCode   -0.00028
dtype: float64
Based on the above plot and the correlation analysis, ZIP code is unlikely to be helpful in predicting Personal_Loan.
# creating a crosstab for Personal_Loan vs Family
tab = pd.crosstab(
data['Family'],
data['Personal_Loan'],
normalize='index' # normalizing by dividing each row by its row total
).sort_values(by=0, ascending=False) # sorting the resulting crosstab
# Plot the stacked bar chart
tab.plot(kind='bar', stacked=True, figsize=(7, 5)) # creating a stacked bar chart from the normalized crosstab
plt.xlabel('Family')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Personal_Loan'); # adding a legend for the 'Personal_Loan' column
From the above chart, there is some increase in the likelihood of purchasing the loan as family size grows.
Univariate Analysis¶
Numerical Variables¶
For each numerical feature, let's do a histogram and boxplot.
histogram_boxplot(data, 'Age')
histogram_boxplot(data, 'Experience')
histogram_boxplot(data, 'Income')
histogram_boxplot(data, 'CCAvg')
histogram_boxplot(data, 'Mortgage')
Categorical Variables¶
For each categorical variable, let's do a labeled barplot; stacked bar plots against the target variable Personal_Loan follow in the bivariate analysis below.
labeled_barplot(data, 'Family')
labeled_barplot(data, 'Education')
labeled_barplot(data, 'Securities_Account')
labeled_barplot(data, 'CD_Account')
labeled_barplot(data, 'Online')
labeled_barplot(data, 'CreditCard')
Bivariate Analysis¶
Let's look at the correlation matrix for all the numeric variables.
cols_list = num_features
plt.figure(figsize=(12, 7))
sns.heatmap(
data[cols_list].corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral"
)
plt.show()
Experience and Age are almost perfectly correlated, while Income and CCAvg are strongly correlated. Otherwise, most of the variables show little correlation with one another.
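For reference, the exact coefficients behind the two strongest cells of the heatmap can be confirmed numerically:
# exact pairwise correlations for the two strongest pairs in the heatmap
print("Age vs Experience:", round(data['Age'].corr(data['Experience']), 3))
print("Income vs CCAvg:", round(data['Income'].corr(data['CCAvg']), 3))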
Numerical Variables¶
For each numerical feature, let's do a distribution plot versus our target value of Personal_Loan.
distribution_plot_wrt_target(data, 'Age', 'Personal_Loan')
distribution_plot_wrt_target(data, 'Experience', 'Personal_Loan')
distribution_plot_wrt_target(data, 'Income', 'Personal_Loan')
distribution_plot_wrt_target(data, 'CCAvg', 'Personal_Loan')
distribution_plot_wrt_target(data, 'Mortgage', 'Personal_Loan')
Categorical Variables¶
For each categorical variable let's do a stacked bar plot for our target variable Personal_Loan.
stacked_barplot(data, 'Family', 'Personal_Loan')
Personal_Loan     0    1   All
Family
All            4520  480  5000
4              1088  134  1222
3               877  133  1010
1              1365  107  1472
2              1190  106  1296
------------------------------------------------------------------------------------------------------------------------
stacked_barplot(data, 'Education', 'Personal_Loan')
Personal_Loan     0    1   All
Education
All            4520  480  5000
3              1296  205  1501
2              1221  182  1403
1              2003   93  2096
------------------------------------------------------------------------------------------------------------------------
stacked_barplot(data, 'Securities_Account', 'Personal_Loan')
Personal_Loan          0    1   All
Securities_Account
All                 4520  480  5000
0                   4058  420  4478
1                    462   60   522
------------------------------------------------------------------------------------------------------------------------
stacked_barplot(data, 'CD_Account', 'Personal_Loan')
Personal_Loan     0    1   All
CD_Account
All            4520  480  5000
0              4358  340  4698
1               162  140   302
------------------------------------------------------------------------------------------------------------------------
stacked_barplot(data, 'Online', 'Personal_Loan')
Personal_Loan     0    1   All
Online
All            4520  480  5000
1              2693  291  2984
0              1827  189  2016
------------------------------------------------------------------------------------------------------------------------
stacked_barplot(data, 'CreditCard', 'Personal_Loan')
Personal_Loan     0    1   All
CreditCard
All            4520  480  5000
0              3193  337  3530
1              1327  143  1470
------------------------------------------------------------------------------------------------------------------------
Data Preprocessing¶
- Missing value treatment
- Feature engineering (if needed)
- Outlier detection and treatment (if needed)
- Preparing data for modeling
- Any other preprocessing steps (if needed)
As we have seen in the Data Overview section, there are no missing values. We can use box plots to see which outliers are present in the numeric features.
# outlier detection using boxplot
plt.figure(figsize=(15, 12))
for i, variable in enumerate(num_features):
plt.subplot(4, 4, i + 1)
plt.boxplot(data[variable], whis=1.5)
plt.tight_layout()
plt.title(variable)
plt.show()
Income, CCAvg, and Mortgage have outliers; however, since these are valid values, we won't apply any specific treatment to them.
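For reference, a minimal sketch counting how many observations fall outside the standard 1.5×IQR whiskers for these three features:
# counting observations beyond the 1.5*IQR whiskers
for col in ['Income', 'CCAvg', 'Mortgage']:
    q1, q3 = data[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    outliers = ((data[col] < q1 - 1.5 * iqr) | (data[col] > q3 + 1.5 * iqr)).sum()
    print(f"{col}: {outliers} outliers")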
# We won't be using ID or ZIPCode, so remove them.
data.drop(['ID', 'ZIPCode'], axis=1, inplace=True)
# defining the explanatory (independent) and response (dependent) variables
X = data.drop(["Personal_Loan"], axis=1)
y = data["Personal_Loan"]
# splitting the data in a 70:30 ratio for train and test sets
# stratify ensures that the training and test sets have a similar distribution of the response variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)
print("Shape of Training set : ", X_train.shape)
print("Shape of test set : ", X_test.shape)
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))
Shape of Training set :  (3500, 11)
Shape of test set :  (1500, 11)
Percentage of classes in training set:
Personal_Loan
0    0.904
1    0.096
Name: proportion, dtype: float64
Percentage of classes in test set:
Personal_Loan
0    0.904
1    0.096
Name: proportion, dtype: float64
Model Building¶
Model Evaluation Criterion¶
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification(model, predictors, target):
"""
Function to compute different metrics to check classification model performance
model: classifier
predictors: independent variables
target: dependent variable
"""
# predicting using the independent variables
pred = model.predict(predictors)
acc = accuracy_score(target, pred) # to compute Accuracy
recall = recall_score(target, pred) # to compute Recall
precision = precision_score(target, pred) # to compute Precision
f1 = f1_score(target, pred) # to compute F1-score
# creating a dataframe of metrics
df_perf = pd.DataFrame(
{"Accuracy": acc, "Recall": recall, "Precision": precision, "F1": f1,},
index=[0],
)
return df_perf
def plot_confusion_matrix(model, predictors, target):
"""
To plot the confusion_matrix with percentages
model: classifier
predictors: independent variables
target: dependent variable
"""
# Predict the target values using the provided model and predictors
y_pred = model.predict(predictors)
# Compute the confusion matrix comparing the true target values with the predicted values
cm = confusion_matrix(target, y_pred)
# Create labels for each cell in the confusion matrix with both count and percentage
labels = np.asarray(
[
["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / cm.flatten().sum())]
for item in cm.flatten()
]
).reshape(2, 2) # reshaping to a matrix
# Set the figure size for the plot
plt.figure(figsize=(6, 4))
# Plot the confusion matrix as a heatmap with the labels
sns.heatmap(cm, annot=labels, fmt="")
# Add a label to the y-axis
plt.ylabel("True label")
# Add a label to the x-axis
plt.xlabel("Predicted label")
Decision Tree (Default)¶
# creating an instance of the decision tree model
dtree1 = DecisionTreeClassifier(random_state=42) # random_state sets a seed value and enables reproducibility
# fitting the model to the training data
dtree1.fit(X_train, y_train)
DecisionTreeClassifier(random_state=42)
plot_confusion_matrix(dtree1, X_train, y_train)
dtree1_train_perf = model_performance_classification(
dtree1, X_train, y_train
)
dtree1_train_perf
| | Accuracy | Recall | Precision | F1 |
|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 1.0 |
plot_confusion_matrix(dtree1, X_test, y_test)
dtree1_test_perf = model_performance_classification(
dtree1, X_test, y_test
)
dtree1_test_perf
| | Accuracy | Recall | Precision | F1 |
|---|---|---|---|---|
| 0 | 0.980667 | 0.909722 | 0.891156 | 0.900344 |
# Visualizing the Decision Tree (default)
# list of feature names in X_train
feature_names = list(X_train.columns)
# set the figure size for the plot
plt.figure(figsize=(20, 20))
# plotting the decision tree
out = tree.plot_tree(
dtree1, # decision tree classifier model
feature_names=feature_names, # list of feature names (columns) in the dataset
filled=True, # fill the nodes with colors based on class
fontsize=9, # font size for the node text
node_ids=False, # do not show the ID of each node
class_names=None, # whether or not to display class names
)
# add arrows to the decision tree splits if they are missing
for o in out:
arrow = o.arrow_patch
if arrow is not None:
arrow.set_edgecolor("black") # set arrow color to black
arrow.set_linewidth(1) # set arrow linewidth to 1
# displaying the plot
plt.show()
# Text report showing the rules of a decision tree -
print(tree.export_text(dtree1, feature_names=feature_names, show_weights=True))
|--- Income <= 98.50
|   |--- CCAvg <= 2.95
|   |   |--- weights: [2483.00, 0.00] class: 0
|   |--- CCAvg > 2.95
|   |   |--- CD_Account <= 0.50
|   |   |   |--- Age <= 27.00
|   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |--- Age > 27.00
|   |   |   |   |--- Income <= 92.50
|   |   |   |   |   |--- CCAvg <= 3.65
|   |   |   |   |   |   |--- Mortgage <= 216.50
|   |   |   |   |   |   |   |--- Income <= 82.50
|   |   |   |   |   |   |   |   |--- Experience <= 18.50
|   |   |   |   |   |   |   |   |   |--- Age <= 43.00
|   |   |   |   |   |   |   |   |   |   |--- Education <= 1.50
|   |   |   |   |   |   |   |   |   |   |   |--- weights: [13.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |   |   |--- Education > 1.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |--- Age > 43.00
|   |   |   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |   |   |--- Experience > 18.50
|   |   |   |   |   |   |   |   |   |--- weights: [24.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- Income > 82.50
|   |   |   |   |   |   |   |   |--- CCAvg <= 3.05
|   |   |   |   |   |   |   |   |   |--- weights: [6.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- CCAvg > 3.05
|   |   |   |   |   |   |   |   |   |--- Education <= 2.50
|   |   |   |   |   |   |   |   |   |   |--- Family <= 1.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |   |--- Family > 1.50
|   |   |   |   |   |   |   |   |   |   |   |--- weights: [7.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |   |--- Education > 2.50
|   |   |   |   |   |   |   |   |   |   |--- Mortgage <= 94.00
|   |   |   |   |   |   |   |   |   |   |   |--- weights: [0.00, 4.00] class: 1
|   |   |   |   |   |   |   |   |   |   |--- Mortgage > 94.00
|   |   |   |   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |--- Mortgage > 216.50
|   |   |   |   |   |   |   |--- Income <= 68.00
|   |   |   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |   |   |   |   |--- Income > 68.00
|   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |--- CCAvg > 3.65
|   |   |   |   |   |   |--- Mortgage <= 89.00
|   |   |   |   |   |   |   |--- weights: [43.00, 0.00] class: 0
|   |   |   |   |   |   |--- Mortgage > 89.00
|   |   |   |   |   |   |   |--- Mortgage <= 99.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |   |--- Mortgage > 99.50
|   |   |   |   |   |   |   |   |--- weights: [13.00, 0.00] class: 0
|   |   |   |   |--- Income > 92.50
|   |   |   |   |   |--- Education <= 1.50
|   |   |   |   |   |   |--- weights: [6.00, 0.00] class: 0
|   |   |   |   |   |--- Education > 1.50
|   |   |   |   |   |   |--- Income <= 96.50
|   |   |   |   |   |   |   |--- weights: [0.00, 5.00] class: 1
|   |   |   |   |   |   |--- Income > 96.50
|   |   |   |   |   |   |   |--- Age <= 48.50
|   |   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- Age > 48.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |--- CD_Account > 0.50
|   |   |   |--- CCAvg <= 4.25
|   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |   |--- CCAvg > 4.25
|   |   |   |   |--- Mortgage <= 38.00
|   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |--- Mortgage > 38.00
|   |   |   |   |   |--- weights: [3.00, 0.00] class: 0
|--- Income > 98.50
|   |--- Education <= 1.50
|   |   |--- Family <= 2.50
|   |   |   |--- Income <= 99.50
|   |   |   |   |--- Family <= 1.50
|   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |   |--- Family > 1.50
|   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |--- Income > 99.50
|   |   |   |   |--- Income <= 104.50
|   |   |   |   |   |--- CCAvg <= 3.31
|   |   |   |   |   |   |--- weights: [17.00, 0.00] class: 0
|   |   |   |   |   |--- CCAvg > 3.31
|   |   |   |   |   |   |--- CCAvg <= 4.25
|   |   |   |   |   |   |   |--- Mortgage <= 124.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |   |   |--- Mortgage > 124.50
|   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |--- CCAvg > 4.25
|   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |--- Income > 104.50
|   |   |   |   |   |--- weights: [449.00, 0.00] class: 0
|   |   |--- Family > 2.50
|   |   |   |--- Income <= 113.50
|   |   |   |   |--- Online <= 0.50
|   |   |   |   |   |--- CCAvg <= 2.05
|   |   |   |   |   |   |--- Experience <= 15.50
|   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |--- Experience > 15.50
|   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |--- CCAvg > 2.05
|   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |--- Online > 0.50
|   |   |   |   |   |--- CD_Account <= 0.50
|   |   |   |   |   |   |--- weights: [9.00, 0.00] class: 0
|   |   |   |   |   |--- CD_Account > 0.50
|   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |--- Income > 113.50
|   |   |   |   |--- weights: [0.00, 49.00] class: 1
|   |--- Education > 1.50
|   |   |--- Income <= 114.50
|   |   |   |--- CCAvg <= 2.45
|   |   |   |   |--- Income <= 106.50
|   |   |   |   |   |--- weights: [28.00, 0.00] class: 0
|   |   |   |   |--- Income > 106.50
|   |   |   |   |   |--- Experience <= 8.00
|   |   |   |   |   |   |--- weights: [11.00, 0.00] class: 0
|   |   |   |   |   |--- Experience > 8.00
|   |   |   |   |   |   |--- Experience <= 31.50
|   |   |   |   |   |   |   |--- Family <= 3.50
|   |   |   |   |   |   |   |   |--- Age <= 36.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |   |   |   |   |   |--- Age > 36.50
|   |   |   |   |   |   |   |   |   |--- Mortgage <= 231.00
|   |   |   |   |   |   |   |   |   |   |--- CCAvg <= 1.05
|   |   |   |   |   |   |   |   |   |   |   |--- weights: [6.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |   |   |--- CCAvg > 1.05
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 2
|   |   |   |   |   |   |   |   |   |--- Mortgage > 231.00
|   |   |   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |   |--- Family > 3.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |   |--- Experience > 31.50
|   |   |   |   |   |   |   |--- weights: [11.00, 0.00] class: 0
|   |   |   |--- CCAvg > 2.45
|   |   |   |   |--- CCAvg <= 4.65
|   |   |   |   |   |--- CCAvg <= 4.45
|   |   |   |   |   |   |--- Age <= 63.50
|   |   |   |   |   |   |   |--- Family <= 1.50
|   |   |   |   |   |   |   |   |--- Age <= 45.00
|   |   |   |   |   |   |   |   |   |--- Experience <= 2.50
|   |   |   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |   |   |   |--- Experience > 2.50
|   |   |   |   |   |   |   |   |   |   |--- weights: [5.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- Age > 45.00
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |   |   |   |   |--- Family > 1.50
|   |   |   |   |   |   |   |   |--- Experience <= 20.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 9.00] class: 1
|   |   |   |   |   |   |   |   |--- Experience > 20.50
|   |   |   |   |   |   |   |   |   |--- Age <= 52.00
|   |   |   |   |   |   |   |   |   |   |--- weights: [3.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |   |--- Age > 52.00
|   |   |   |   |   |   |   |   |   |   |--- Family <= 3.50
|   |   |   |   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |   |   |   |   |   |--- Family > 3.50
|   |   |   |   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |--- Age > 63.50
|   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |--- CCAvg > 4.45
|   |   |   |   |   |   |--- weights: [3.00, 0.00] class: 0
|   |   |   |   |--- CCAvg > 4.65
|   |   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |--- Income > 114.50
|   |   |   |--- Income <= 116.50
|   |   |   |   |--- CCAvg <= 1.10
|   |   |   |   |   |--- CCAvg <= 0.65
|   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |--- CCAvg > 0.65
|   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |--- CCAvg > 1.10
|   |   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |   |--- Income > 116.50
|   |   |   |   |--- weights: [0.00, 215.00] class: 1
Decision Tree (Pre-Pruning)¶
# define the parameters of the tree to iterate over
max_depth_values = np.arange(2, 11, 1)
max_leaf_nodes_values = np.arange(10, 51, 5)
min_samples_split_values = np.arange(10, 51, 5)
# initialize variables to store the best model and its performance
best_estimator = None
best_score_diff = float('inf')
# iterate over all combinations of the specified parameter values
for max_depth in max_depth_values:
for max_leaf_nodes in max_leaf_nodes_values:
for min_samples_split in min_samples_split_values:
# initialize the tree with the current set of parameters
estimator = DecisionTreeClassifier(
max_depth=max_depth,
max_leaf_nodes=max_leaf_nodes,
min_samples_split=min_samples_split,
random_state=42
)
# fit the model to the training data
estimator.fit(X_train, y_train)
# make predictions on the training and test sets
y_train_pred = estimator.predict(X_train)
y_test_pred = estimator.predict(X_test)
# Calculate recall scores for training and test sets
train_recall_score = recall_score(y_train, y_train_pred)
test_recall_score = recall_score(y_test, y_test_pred)
# Calculate the absolute difference between training and test recall scores
score_diff = abs(train_recall_score - test_recall_score)
# update the best estimator and best score if the current one has a smaller score difference
if score_diff < best_score_diff:
best_score_diff = score_diff
best_estimator = estimator
best_test_score = test_recall_score
# Print the best parameters
print("Best parameters found:")
print(f"Max depth: {best_estimator.max_depth}")
print(f"Max leaf nodes: {best_estimator.max_leaf_nodes}")
print(f"Min samples split: {best_estimator.min_samples_split}")
print(f"Best test recall score: {best_test_score}")
Best parameters found:
Max depth: 8
Max leaf nodes: 25
Min samples split: 10
Best test recall score: 0.9305555555555556
# creating an instance of the best model
dtree2 = best_estimator
# fitting the best model to the training data
dtree2.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=8, max_leaf_nodes=25, min_samples_split=10,
                       random_state=42)
plot_confusion_matrix(dtree2, X_train, y_train)
dtree2_train_perf = model_performance_classification(
dtree2, X_train, y_train
)
dtree2_train_perf
| | Accuracy | Recall | Precision | F1 |
|---|---|---|---|---|
| 0 | 0.990286 | 0.931548 | 0.966049 | 0.948485 |
plot_confusion_matrix(dtree2, X_test, y_test)
dtree2_test_perf = model_performance_classification(
dtree2, X_test, y_test
)
dtree2_test_perf
| | Accuracy | Recall | Precision | F1 |
|---|---|---|---|---|
| 0 | 0.982 | 0.930556 | 0.887417 | 0.908475 |
# Visualizing the Decision Tree (pre-pruning)
# list of feature names in X_train
feature_names = list(X_train.columns)
# set the figure size for the plot
plt.figure(figsize=(20, 20))
# plotting the decision tree
out = tree.plot_tree(
dtree2, # decision tree classifier model
feature_names=feature_names, # list of feature names (columns) in the dataset
filled=True, # fill the nodes with colors based on class
fontsize=9, # font size for the node text
node_ids=False, # do not show the ID of each node
class_names=None, # whether or not to display class names
)
# add arrows to the decision tree splits if they are missing
for o in out:
arrow = o.arrow_patch
if arrow is not None:
arrow.set_edgecolor("black") # set arrow color to black
arrow.set_linewidth(1) # set arrow linewidth to 1
# displaying the plot
plt.show()
# Text report showing the rules of a decision tree -
print(tree.export_text(dtree2, feature_names=feature_names, show_weights=True))
|--- Income <= 98.50
|   |--- CCAvg <= 2.95
|   |   |--- weights: [2483.00, 0.00] class: 0
|   |--- CCAvg > 2.95
|   |   |--- CD_Account <= 0.50
|   |   |   |--- Age <= 27.00
|   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |--- Age > 27.00
|   |   |   |   |--- Income <= 92.50
|   |   |   |   |   |--- CCAvg <= 3.65
|   |   |   |   |   |   |--- weights: [57.00, 10.00] class: 0
|   |   |   |   |   |--- CCAvg > 3.65
|   |   |   |   |   |   |--- weights: [56.00, 1.00] class: 0
|   |   |   |   |--- Income > 92.50
|   |   |   |   |   |--- Education <= 1.50
|   |   |   |   |   |   |--- weights: [6.00, 0.00] class: 0
|   |   |   |   |   |--- Education > 1.50
|   |   |   |   |   |   |--- weights: [2.00, 6.00] class: 1
|   |   |--- CD_Account > 0.50
|   |   |   |--- CCAvg <= 4.25
|   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |   |--- CCAvg > 4.25
|   |   |   |   |--- weights: [3.00, 1.00] class: 0
|--- Income > 98.50
|   |--- Education <= 1.50
|   |   |--- Family <= 2.50
|   |   |   |--- Income <= 99.50
|   |   |   |   |--- weights: [1.00, 2.00] class: 1
|   |   |   |--- Income > 99.50
|   |   |   |   |--- weights: [469.00, 3.00] class: 0
|   |   |--- Family > 2.50
|   |   |   |--- Income <= 113.50
|   |   |   |   |--- Online <= 0.50
|   |   |   |   |   |--- weights: [2.00, 4.00] class: 1
|   |   |   |   |--- Online > 0.50
|   |   |   |   |   |--- CD_Account <= 0.50
|   |   |   |   |   |   |--- weights: [9.00, 0.00] class: 0
|   |   |   |   |   |--- CD_Account > 0.50
|   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |--- Income > 113.50
|   |   |   |   |--- weights: [0.00, 49.00] class: 1
|   |--- Education > 1.50
|   |   |--- Income <= 114.50
|   |   |   |--- CCAvg <= 2.45
|   |   |   |   |--- Income <= 106.50
|   |   |   |   |   |--- weights: [28.00, 0.00] class: 0
|   |   |   |   |--- Income > 106.50
|   |   |   |   |   |--- Experience <= 8.00
|   |   |   |   |   |   |--- weights: [11.00, 0.00] class: 0
|   |   |   |   |   |--- Experience > 8.00
|   |   |   |   |   |   |--- Experience <= 31.50
|   |   |   |   |   |   |   |--- Family <= 3.50
|   |   |   |   |   |   |   |   |--- weights: [10.00, 5.00] class: 0
|   |   |   |   |   |   |   |--- Family > 3.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |   |--- Experience > 31.50
|   |   |   |   |   |   |   |--- weights: [11.00, 0.00] class: 0
|   |   |   |--- CCAvg > 2.45
|   |   |   |   |--- CCAvg <= 4.65
|   |   |   |   |   |--- CCAvg <= 4.45
|   |   |   |   |   |   |--- Age <= 63.50
|   |   |   |   |   |   |   |--- Family <= 1.50
|   |   |   |   |   |   |   |   |--- weights: [5.00, 3.00] class: 0
|   |   |   |   |   |   |   |--- Family > 1.50
|   |   |   |   |   |   |   |   |--- weights: [4.00, 12.00] class: 1
|   |   |   |   |   |   |--- Age > 63.50
|   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |--- CCAvg > 4.45
|   |   |   |   |   |   |--- weights: [3.00, 0.00] class: 0
|   |   |   |   |--- CCAvg > 4.65
|   |   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |--- Income > 114.50
|   |   |   |--- weights: [2.00, 222.00] class: 1
Decision Tree (Post-Pruning)¶
# Create an instance of the decision tree model
clf = DecisionTreeClassifier(random_state=42)
# Compute the cost complexity pruning path for the model using the training data
path = clf.cost_complexity_pruning_path(X_train, y_train)
# Extract the array of effective alphas from the pruning path
ccp_alphas = abs(path.ccp_alphas)
# Extract the array of total impurities at each alpha along the pruning path
impurities = path.impurities
pd.DataFrame(path)
| | ccp_alphas | impurities |
|---|---|---|
| 0 | 0.000000 | 0.000000 |
| 1 | 0.000270 | 0.000540 |
| 2 | 0.000275 | 0.001090 |
| 3 | 0.000281 | 0.001651 |
| 4 | 0.000378 | 0.002784 |
| 5 | 0.000381 | 0.003165 |
| 6 | 0.000381 | 0.003546 |
| 7 | 0.000381 | 0.003927 |
| 8 | 0.000381 | 0.004308 |
| 9 | 0.000381 | 0.005069 |
| 10 | 0.000426 | 0.006773 |
| 11 | 0.000429 | 0.007201 |
| 12 | 0.000429 | 0.007630 |
| 13 | 0.000440 | 0.008949 |
| 14 | 0.000457 | 0.009406 |
| 15 | 0.000476 | 0.009882 |
| 16 | 0.000476 | 0.010358 |
| 17 | 0.000476 | 0.013216 |
| 18 | 0.000514 | 0.013730 |
| 19 | 0.000539 | 0.016964 |
| 20 | 0.000543 | 0.019679 |
| 21 | 0.000662 | 0.020341 |
| 22 | 0.000688 | 0.021029 |
| 23 | 0.000743 | 0.021771 |
| 24 | 0.000771 | 0.022543 |
| 25 | 0.000933 | 0.025341 |
| 26 | 0.001698 | 0.027040 |
| 27 | 0.002429 | 0.029468 |
| 28 | 0.003072 | 0.032540 |
| 29 | 0.003258 | 0.035798 |
| 30 | 0.020297 | 0.056095 |
| 31 | 0.021982 | 0.078076 |
| 32 | 0.047746 | 0.173568 |
# Create a figure
fig, ax = plt.subplots(figsize=(10, 5))
# Plot the total impurities versus effective alphas, excluding the last value,
# using markers at each data point and connecting them with steps
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
# Set the x-axis label
ax.set_xlabel("Effective Alpha")
# Set the y-axis label
ax.set_ylabel("Total impurity of leaves")
# Set the title of the plot
ax.set_title("Total Impurity vs Effective Alpha for training set");
Next, we train a decision tree using each of the effective alphas. The last value in ccp_alphas is the alpha that prunes the whole tree, leaving a tree with a single node.
# Initialize an empty list to store the decision tree classifiers
clfs = []
# Iterate over each ccp_alpha value extracted from cost complexity pruning path
for ccp_alpha in ccp_alphas:
# Create an instance of the DecisionTreeClassifier
clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=42)
# Fit the classifier to the training data
clf.fit(X_train, y_train)
# Append the trained classifier to the list
clfs.append(clf)
# Print the number of nodes in the last tree along with its ccp_alpha value
print(
"Number of nodes in the last tree is {} with ccp_alpha {}".format(
clfs[-1].tree_.node_count, ccp_alphas[-1]
)
)
Number of nodes in the last tree is 1 with ccp_alpha 0.04774589891961516
- Moving ahead, we remove the last element in clfs and ccp_alphas, as it corresponds to a trivial tree with only one node.
# Remove the last classifier and corresponding ccp_alpha value from the lists
clfs = clfs[:-1]
ccp_alphas = ccp_alphas[:-1]
# Extract the number of nodes in each tree classifier
node_counts = [clf.tree_.node_count for clf in clfs]
# Extract the maximum depth of each tree classifier
depth = [clf.tree_.max_depth for clf in clfs]
# Create a figure and a set of subplots
fig, ax = plt.subplots(2, 1, figsize=(10, 7))
# Plot the number of nodes versus ccp_alphas on the first subplot
ax[0].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
ax[0].set_xlabel("Alpha")
ax[0].set_ylabel("Number of nodes")
ax[0].set_title("Number of nodes vs Alpha")
# Plot the depth of tree versus ccp_alphas on the second subplot
ax[1].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
ax[1].set_xlabel("Alpha")
ax[1].set_ylabel("Depth of tree")
ax[1].set_title("Depth vs Alpha")
# Adjust the layout of the subplots to avoid overlap
fig.tight_layout()
recall_train = [] # Initialize an empty list to store recall scores for training set for each decision tree classifier
# Iterate through each decision tree classifier in 'clfs'
for clf in clfs:
# Predict labels for the training set using the current decision tree classifier
pred_train = clf.predict(X_train)
# Calculate the recall score for the training set predictions compared to true labels
values_train = recall_score(y_train, pred_train)
# Append the calculated recall score to the recall_train list
recall_train.append(values_train)
recall_test = [] # Initialize an empty list to store recall scores for test set for each decision tree classifier
# Iterate through each decision tree classifier in 'clfs'
for clf in clfs:
# Predict labels for the test set using the current decision tree classifier
pred_test = clf.predict(X_test)
# Calculate the recall score for the test set predictions compared to true labels
values_test = recall_score(y_test, pred_test)
# Append the calculated recall score to the recall_test list
recall_test.append(values_test)
# Create a figure
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_xlabel("Alpha") # Set the label for the x-axis
ax.set_ylabel("Recall Score") # Set the label for the y-axis
ax.set_title("Recall Score vs Alpha for training and test sets") # Set the title of the plot
# Plot the training Recall scores against alpha, using circles as markers and steps-post style
ax.plot(ccp_alphas, recall_train, marker="o", label="training", drawstyle="steps-post")
# Plot the testing Recall scores against alpha, using circles as markers and steps-post style
ax.plot(ccp_alphas, recall_test, marker="o", label="test", drawstyle="steps-post")
ax.legend(); # Add a legend to the plot
# creating the model where we get highest test Recall Score
index_best_model = np.argmax(recall_test)
# selecting the decision tree model corresponding to the highest test score
dtree3 = clfs[index_best_model]
print(dtree3)
DecisionTreeClassifier(ccp_alpha=0.0005388615216201423, random_state=42)
plot_confusion_matrix(dtree3, X_train, y_train)
dtree3_train_perf = model_performance_classification(
dtree3, X_train, y_train
)
dtree3_train_perf
| | Accuracy | Recall | Precision | F1 |
|---|---|---|---|---|
| 0 | 0.988571 | 0.943452 | 0.93787 | 0.940653 |
plot_confusion_matrix(dtree3, X_test, y_test)
dtree3_test_perf = model_performance_classification(
dtree3, X_test, y_test
)
dtree3_test_perf
| | Accuracy | Recall | Precision | F1 |
|---|---|---|---|---|
| 0 | 0.982 | 0.958333 | 0.867925 | 0.910891 |
# list of feature names in X_train
feature_names = list(X_train.columns)
# set the figure size for the plot
plt.figure(figsize=(10, 7))
# plotting the decision tree
out = tree.plot_tree(
dtree3, # decision tree classifier model
feature_names=feature_names, # list of feature names (columns) in the dataset
filled=True, # fill the nodes with colors based on class
fontsize=9, # font size for the node text
node_ids=False, # do not show the ID of each node
class_names=None, # whether or not to display class names
)
# add arrows to the decision tree splits if they are missing
for o in out:
arrow = o.arrow_patch
if arrow is not None:
arrow.set_edgecolor("black") # set arrow color to black
arrow.set_linewidth(1) # set arrow linewidth to 1
# displaying the plot
plt.show()
# Text report showing the rules of a decision tree -
print(tree.export_text(dtree3, feature_names=feature_names, show_weights=True))
|--- Income <= 98.50
|   |--- CCAvg <= 2.95
|   |   |--- weights: [2483.00, 0.00] class: 0
|   |--- CCAvg > 2.95
|   |   |--- CD_Account <= 0.50
|   |   |   |--- Age <= 27.00
|   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |--- Age > 27.00
|   |   |   |   |--- Income <= 92.50
|   |   |   |   |   |--- weights: [113.00, 11.00] class: 0
|   |   |   |   |--- Income > 92.50
|   |   |   |   |   |--- Education <= 1.50
|   |   |   |   |   |   |--- weights: [6.00, 0.00] class: 0
|   |   |   |   |   |--- Education > 1.50
|   |   |   |   |   |   |--- weights: [2.00, 6.00] class: 1
|   |   |--- CD_Account > 0.50
|   |   |   |--- CCAvg <= 4.25
|   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |   |--- CCAvg > 4.25
|   |   |   |   |--- weights: [3.00, 1.00] class: 0
|--- Income > 98.50
|   |--- Education <= 1.50
|   |   |--- Family <= 2.50
|   |   |   |--- Income <= 99.50
|   |   |   |   |--- weights: [1.00, 2.00] class: 1
|   |   |   |--- Income > 99.50
|   |   |   |   |--- weights: [469.00, 3.00] class: 0
|   |   |--- Family > 2.50
|   |   |   |--- Income <= 113.50
|   |   |   |   |--- Online <= 0.50
|   |   |   |   |   |--- weights: [2.00, 4.00] class: 1
|   |   |   |   |--- Online > 0.50
|   |   |   |   |   |--- weights: [9.00, 1.00] class: 0
|   |   |   |--- Income > 113.50
|   |   |   |   |--- weights: [0.00, 49.00] class: 1
|   |--- Education > 1.50
|   |   |--- Income <= 114.50
|   |   |   |--- CCAvg <= 2.45
|   |   |   |   |--- Income <= 106.50
|   |   |   |   |   |--- weights: [28.00, 0.00] class: 0
|   |   |   |   |--- Income > 106.50
|   |   |   |   |   |--- Experience <= 8.00
|   |   |   |   |   |   |--- weights: [11.00, 0.00] class: 0
|   |   |   |   |   |--- Experience > 8.00
|   |   |   |   |   |   |--- Experience <= 31.50
|   |   |   |   |   |   |   |--- Family <= 3.50
|   |   |   |   |   |   |   |   |--- Age <= 36.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |   |   |   |   |   |--- Age > 36.50
|   |   |   |   |   |   |   |   |   |--- weights: [10.00, 3.00] class: 0
|   |   |   |   |   |   |   |--- Family > 3.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |   |--- Experience > 31.50
|   |   |   |   |   |   |   |--- weights: [11.00, 0.00] class: 0
|   |   |   |--- CCAvg > 2.45
|   |   |   |   |--- CCAvg <= 4.65
|   |   |   |   |   |--- weights: [14.00, 15.00] class: 1
|   |   |   |   |--- CCAvg > 4.65
|   |   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |--- Income > 114.50
|   |   |   |--- weights: [2.00, 222.00] class: 1
Decision Tree (GridSearchCV)¶
# Define the parameter grid using the same values we used in Decision Tree (Pre-pruning)
param_grid = {
'max_depth': np.arange(2, 11, 1),
'max_leaf_nodes': np.arange(10, 51, 5),
'min_samples_split': np.arange(10, 51, 5)
}
# Initialize the DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=42)
# Define the scoring metric
scorer = make_scorer(recall_score)
# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring=scorer, n_jobs=-1)
# Fit the model
grid_search.fit(X_train, y_train)
# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)
# The best model
dtree4 = grid_search.best_estimator_
Best parameters found: {'max_depth': 5, 'max_leaf_nodes': 20, 'min_samples_split': 10}
plot_confusion_matrix(dtree4, X_train, y_train)
dtree4_train_perf = model_performance_classification(
dtree4, X_train, y_train
)
dtree4_train_perf
| | Accuracy | Recall | Precision | F1 |
|---|---|---|---|---|
| 0 | 0.986 | 0.910714 | 0.941538 | 0.92587 |
plot_confusion_matrix(dtree4, X_test, y_test)
dtree4_test_perf = model_performance_classification(
dtree4, X_test, y_test
)
dtree4_test_perf
| | Accuracy | Recall | Precision | F1 |
|---|---|---|---|---|
| 0 | 0.982667 | 0.951389 | 0.878205 | 0.913333 |
# Visualizing the Decision Tree (GridSearchCV)
# list of feature names in X_train
feature_names = list(X_train.columns)
# set the figure size for the plot
plt.figure(figsize=(20, 20))
# plotting the decision tree
out = tree.plot_tree(
dtree4, # decision tree classifier model
feature_names=feature_names, # list of feature names (columns) in the dataset
filled=True, # fill the nodes with colors based on class
fontsize=9, # font size for the node text
node_ids=False, # do not show the ID of each node
class_names=None, # whether or not to display class names
)
# add arrows to the decision tree splits if they are missing
for o in out:
arrow = o.arrow_patch
if arrow is not None:
arrow.set_edgecolor("black") # set arrow color to black
arrow.set_linewidth(1) # set arrow linewidth to 1
# displaying the plot
plt.show()
# Text report showing the rules of a decision tree -
print(tree.export_text(dtree4, feature_names=feature_names, show_weights=True))
|--- Income <= 98.50
|   |--- CCAvg <= 2.95
|   |   |--- weights: [2483.00, 0.00] class: 0
|   |--- CCAvg > 2.95
|   |   |--- CD_Account <= 0.50
|   |   |   |--- Age <= 27.00
|   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |--- Age > 27.00
|   |   |   |   |--- Income <= 92.50
|   |   |   |   |   |--- weights: [113.00, 11.00] class: 0
|   |   |   |   |--- Income > 92.50
|   |   |   |   |   |--- weights: [8.00, 6.00] class: 0
|   |   |--- CD_Account > 0.50
|   |   |   |--- CCAvg <= 4.25
|   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |   |--- CCAvg > 4.25
|   |   |   |   |--- weights: [3.00, 1.00] class: 0
|--- Income > 98.50
|   |--- Education <= 1.50
|   |   |--- Family <= 2.50
|   |   |   |--- Income <= 99.50
|   |   |   |   |--- weights: [1.00, 2.00] class: 1
|   |   |   |--- Income > 99.50
|   |   |   |   |--- Income <= 104.50
|   |   |   |   |   |--- weights: [20.00, 3.00] class: 0
|   |   |   |   |--- Income > 104.50
|   |   |   |   |   |--- weights: [449.00, 0.00] class: 0
|   |   |--- Family > 2.50
|   |   |   |--- Income <= 113.50
|   |   |   |   |--- Online <= 0.50
|   |   |   |   |   |--- weights: [2.00, 4.00] class: 1
|   |   |   |   |--- Online > 0.50
|   |   |   |   |   |--- weights: [9.00, 1.00] class: 0
|   |   |   |--- Income > 113.50
|   |   |   |   |--- weights: [0.00, 49.00] class: 1
|   |--- Education > 1.50
|   |   |--- Income <= 114.50
|   |   |   |--- CCAvg <= 2.45
|   |   |   |   |--- Income <= 106.50
|   |   |   |   |   |--- weights: [28.00, 0.00] class: 0
|   |   |   |   |--- Income > 106.50
|   |   |   |   |   |--- weights: [32.00, 8.00] class: 0
|   |   |   |--- CCAvg > 2.45
|   |   |   |   |--- CCAvg <= 4.65
|   |   |   |   |   |--- weights: [14.00, 15.00] class: 1
|   |   |   |   |--- CCAvg > 4.65
|   |   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |--- Income > 114.50
|   |   |   |--- Income <= 116.50
|   |   |   |   |--- weights: [2.00, 7.00] class: 1
|   |   |   |--- Income > 116.50
|   |   |   |   |--- weights: [0.00, 215.00] class: 1
Model Comparison and Final Model Selection¶
# training performance comparison
models_train_comp_df = pd.concat(
[
dtree1_train_perf.T,
dtree2_train_perf.T,
dtree3_train_perf.T,
dtree4_train_perf.T,
],
axis=1,
)
models_train_comp_df.columns = [
"Decision Tree (sklearn default)",
"Decision Tree (Pre-Pruning)",
"Decision Tree (Post-Pruning)",
"Decision Tree (GridSearchCV)",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| | Decision Tree (sklearn default) | Decision Tree (Pre-Pruning) | Decision Tree (Post-Pruning) | Decision Tree (GridSearchCV) |
|---|---|---|---|---|
| Accuracy | 1.0 | 0.990286 | 0.988571 | 0.986000 |
| Recall | 1.0 | 0.931548 | 0.943452 | 0.910714 |
| Precision | 1.0 | 0.966049 | 0.937870 | 0.941538 |
| F1 | 1.0 | 0.948485 | 0.940653 | 0.925870 |
# testing performance comparison
models_test_comp_df = pd.concat(
[
dtree1_test_perf.T,
dtree2_test_perf.T,
dtree3_test_perf.T,
dtree4_test_perf.T,
],
axis=1,
)
models_test_comp_df.columns = [
"Decision Tree (sklearn default)",
"Decision Tree (Pre-Pruning)",
"Decision Tree (Post-Pruning)",
"Decision Tree (GridSearchCV)",
]
print("Testing performance comparison:")
models_test_comp_df
Testing performance comparison:
| | Decision Tree (sklearn default) | Decision Tree (Pre-Pruning) | Decision Tree (Post-Pruning) | Decision Tree (GridSearchCV) |
|---|---|---|---|---|
| Accuracy | 0.980667 | 0.982000 | 0.982000 | 0.982667 |
| Recall | 0.909722 | 0.930556 | 0.958333 | 0.951389 |
| Precision | 0.891156 | 0.887417 | 0.867925 | 0.878205 |
| F1 | 0.900344 | 0.908475 | 0.910891 | 0.913333 |
- Excluding the overfit default tree, which scores a perfect 1.0 on the training data, the Decision Tree (Post-Pruning) model gives the highest recall score on both the training and test sets.
- The high recall score on the test data suggests this model generalizes best to unseen data.
- Therefore, we choose the Decision Tree (Post-Pruning) model; a usage sketch follows below.
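As a usage sketch, the chosen model can score a batch of liability customers; the prospects dataframe here is a hypothetical stand-in and must have the same 11 columns as X_train.
# scoring hypothetical prospects with the chosen post-pruned model
prospects = X_test.head(10).copy()  # hypothetical stand-in for new customers
loan_prob = dtree3.predict_proba(prospects)[:, 1]  # predicted probability of accepting the loan
print(prospects.assign(loan_probability=loan_prob.round(3)))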
# importance of features in the tree building
importances = dtree3.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(8, 8))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
Actionable Insights and Business Recommendations¶
- The model can be used to predict whether a customer will accept a loan proposal, and it correctly identifies about 96% of the customers who will accept (test recall of 0.958).
- Income, Education, Family Size, and CCAvg are the most important variables in predicting whether or not a customer will accept a loan proposal.
From the decision tree there are 11 specific conditions by which Class 1 (customer will accept the loan) is identified.
Below are the top 3 conditions (sized with the sketch after this list):
- Income > 114.50 and Education > 1.50
- Income > 113.50 and Education <= 1.50 and Family > 2.5
- 98.50 < Income <= 114.5 and Education > 1.5 and 2.45 < CCAvg <= 4.65
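As a minimal sketch, these three rules can be expressed as pandas filters to size the corresponding segments; the thresholds are copied from the post-pruned tree above, and the counts are over the full dataset rather than the training split.
# segment sizes implied by the top three rules from the post-pruned tree
rule1 = (data['Income'] > 114.5) & (data['Education'] > 1.5)
rule2 = (data['Income'] > 113.5) & (data['Education'] <= 1.5) & (data['Family'] > 2.5)
rule3 = (
    data['Income'].between(98.5, 114.5, inclusive='right')
    & (data['Education'] > 1.5)
    & data['CCAvg'].between(2.45, 4.65, inclusive='right')
)
for name, rule in [('Rule 1', rule1), ('Rule 2', rule2), ('Rule 3', rule3)]:
    rate = data.loc[rule, 'Personal_Loan'].mean()
    print(f"{name}: {rule.sum()} customers, acceptance rate {rate:.3f}")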