Online Shopper Purchasing Prediction¶

Names¶

  • Jiayi Zhao
  • Beibei Du
  • Hongfei Lyu
  • Meijiang Shen
  • Shirui Huang

Abstract¶

With the shift to digital life, online shopping has become a major channel of consumer spending thanks to its convenience and personalized features. Learning the patterns behind consumer purchase decisions would be highly valuable, whether companies use them to forecast and increase profits or to further personalize the shopping experience. In our project, we therefore aim to predict the final purchase decisions of consumers based on their shopping behaviors. We employ the “Online Shoppers Purchasing Intention” dataset from the UCI Machine Learning Repository, which measures various aspects of consumer shopping behavior, including time spent on each page type, the number of pages browsed, and the influence of special days. We then apply several machine learning models, such as logistic regression, K-NN, and decision trees, to fit the data, and use the confusion matrix and classification accuracy as the metrics to evaluate the performance of our models.

Background¶

As people become more inclined to shop online rather than in physical stores, an enormous number of merchants have chosen to sell their products online or in a hybrid fashion for profit and marketing reasons, on platforms such as Amazon, eBay, and Instacart. During the COVID-19 pandemic in particular, under quarantine conditions, many people purchased goods on their electronic devices and had them delivered. Whether adopted voluntarily or not, online shopping has become both a recreational and a convenient tool for customers. While customers benefit from online shopping, online stores need to target more customers and analyze their behavior to maximize revenue. Evaluating whether a customer will purchase a product thus becomes the core question businesses want to answer. As mentioned in Performance Study of Classification Algorithms for Consumer Online Shopping Attitudes and Behavior Using Data Mining, data mining, including machine learning, has been useful “to help online shopping stores to identify online customer behavior to recommend for him the appropriate products he/she is interesting to them” [1]. Machine learning offers a quantitative answer to how merchants can decide whether a customer will purchase an item based on their behavior, and using classification to detect patterns in customers' purchasing intentions can improve the purchase rate.

Many other machine learning algorithms have also been used to determine customers' purchasing intentions. In Real-time prediction of online shoppers’ purchasing intention using multilayer perceptron and LSTM recurrent neural networks, the authors fed session data into algorithms such as random forests, multilayer perceptrons, and support vector machines [2]. Whatever the algorithm, the goal is the same: predict the intention behind a shopper's behavior so that merchants can adjust their e-commerce strategies, a win for both sides. Across these papers, both supervised and unsupervised machine learning algorithms have been applied. For example, in Web usage mining to improve the design of an e-commerce website: OrOliveSur.com, the authors used such algorithms to improve the design of an e-commerce website and attract more purchasing customers [3].

Building on the existing studies of online shoppers' intentions, merchants can get a sense of how to improve their e-commerce websites and strategies, benefiting their customers' shopping experience while increasing their own profits. In our study, by predicting whether shoppers purchase items or not using multiple supervised machine learning algorithms, we aim to help merchants make further improvements to their current e-commerce systems.

Problem Statement¶

How can we predict the final purchase decisions of consumers from their online shopping behaviors, such as the type of visitor? And which of the models we consider, such as logistic regression, K-NN, and decision trees, performs best at predicting those purchase decisions?

Data¶

  • UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/Online+Shoppers+Purchasing+Intention+Dataset#)
    • It contains 12,330 observations, each belonging to a different user over a one-year period, which avoids any bias toward a specific campaign, day, user profile, or period.

    • It has 18 variables, 17 of which describe a particular customer session, such as whether the customer is viewing the page near a special holiday or how long the customer spent on the page; the "Revenue" variable records whether the customer made a purchase, and serves as the label for our model.

      1. Administrative: The number of administrative pages the user visited.
      2. Administrative_Duration: The amount of time spent on Administrative pages.
      3. Informational: The number of informational pages the user visited.
      4. Informational_Duration: The amount of time spent on Informational pages.
      5. ProductRelated: The number of product-related pages the user visited.
      6. ProductRelated_Duration: The amount of time spent on ProductRelated pages.
      7. BounceRates: The percentage of visitors who enter the website through that page and exit without triggering any additional requests.
      8. ExitRates: The percentage of pageviews on the website that end at that specific page.
      9. PageValues: The average value of the page, averaged over the value of the target page and/or the completion of an eCommerce transaction.
      10. SpecialDay: The closeness of the browsing date to a special day or holiday (e.g. Mother's Day or Valentine's Day), on a scale from 0 to 1.
      11. Month: The month the pageview occurred, in string form.
      12. OperatingSystems: An integer value representing the operating system the user was on when viewing the page.
      13. Browser: An integer value representing the browser the user was using to view the page.
      14. Region: An integer value representing which region the user is located in.
      15. TrafficType: An integer value representing what type of traffic the user is categorized into.
      16. VisitorType: A string representing whether a visitor is a New Visitor, a Returning Visitor, or Other.
      17. Weekend: A boolean representing whether the session took place on a weekend.
      18. Revenue: A boolean representing whether or not the user completed the purchase.
    • However, some of the variables, such as "Region" and "OperatingSystems", are represented as integers (e.g. 1 to 9), and so far there is little information about what those numbers represent. These variables would require further investigation before they can be interpreted.

Setup¶

In [ ]:
# Import pandas to read csv file and manage heterogeneous data
import pandas as pd

# Import numpy to store numeric information and perform numerical analysis
import numpy as np

# Import matplotlib.pyplot and seaborn for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

Data Cleaning & Data Wrangling¶

In [ ]:
# Import the dataframe
df = pd.read_csv("online_shoppers_intention.csv")
df.head()
Out[ ]:
Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay Month OperatingSystems Browser Region TrafficType VisitorType Weekend Revenue
0 0 0.0 0 0.0 1 0.000000 0.20 0.20 0.0 0.0 Feb 1 1 1 1 Returning_Visitor False False
1 0 0.0 0 0.0 2 64.000000 0.00 0.10 0.0 0.0 Feb 2 2 1 2 Returning_Visitor False False
2 0 0.0 0 0.0 1 0.000000 0.20 0.20 0.0 0.0 Feb 4 1 9 3 Returning_Visitor False False
3 0 0.0 0 0.0 2 2.666667 0.05 0.14 0.0 0.0 Feb 3 2 2 4 Returning_Visitor False False
4 0 0.0 0 0.0 10 627.500000 0.02 0.05 0.0 0.0 Feb 3 3 1 4 Returning_Visitor True False

We first want to check the data type of each column, convert non-numeric elements into numeric values, and expand each categorical feature into a set of binary features.

In [ ]:
df = df.rename(columns={'Revenue': 'Purchase'})
df.dtypes
Out[ ]:
Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration    float64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Purchase                      bool
dtype: object

Since the OperatingSystems, Browser, Region, and TrafficType columns do not come with an interpretation of each value, we decided to drop these categorical columns.

In [ ]:
# Convert boolean columns into binary (0/1) values
df["Purchase"] = df["Purchase"].replace([True, False],[1, 0])
df["Weekend"] = df["Weekend"].replace([True, False],[1, 0])
df = df.drop(columns = ['OperatingSystems', 'Browser', 'Region', 'TrafficType'])

Since Month and VisitorType are categorical variables, we use one-hot encoding to convert each of them into a set of binary features.

In [ ]:
def one_hot(df, column):
    """
    One-hot encode a categorical feature into a set of binary features and append them to the dataframe.
    
    Arguments:
        df: DataFrame
        column: str
            the column to one-hot encode
    """ 
    # get the new binary features based on the given column
    onehot = pd.get_dummies(df[column]).astype(int)
    # rename the new columns
    onehot.columns = [column + '_' + str(x) for x in onehot.columns]
    # drop the original column
    tmp = df.drop(columns = [column])
    # concatenate the new columns to the dataframe
    tmp = pd.concat([onehot, tmp], axis=1)
    return tmp
In [ ]:
categorical_cols = ['VisitorType', 'Month']
for c in categorical_cols:
    df = one_hot(df, c)
In [ ]:
df.columns
Out[ ]:
Index(['Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
       'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
       'VisitorType_New_Visitor', 'VisitorType_Other',
       'VisitorType_Returning_Visitor', 'Administrative',
       'Administrative_Duration', 'Informational', 'Informational_Duration',
       'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates',
       'PageValues', 'SpecialDay', 'Weekend', 'Purchase'],
      dtype='object')

Take a look at the cleaned dataframe

In [ ]:
df.head()
Out[ ]:
Month_Aug Month_Dec Month_Feb Month_Jul Month_June Month_Mar Month_May Month_Nov Month_Oct Month_Sep ... Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay Weekend Purchase
0 0 0 1 0 0 0 0 0 0 0 ... 0 0.0 1 0.000000 0.20 0.20 0.0 0.0 0 0
1 0 0 1 0 0 0 0 0 0 0 ... 0 0.0 2 64.000000 0.00 0.10 0.0 0.0 0 0
2 0 0 1 0 0 0 0 0 0 0 ... 0 0.0 1 0.000000 0.20 0.20 0.0 0.0 0 0
3 0 0 1 0 0 0 0 0 0 0 ... 0 0.0 2 2.666667 0.05 0.14 0.0 0.0 0 0
4 0 0 1 0 0 0 0 0 0 0 ... 0 0.0 10 627.500000 0.02 0.05 0.0 0.0 1 0

5 rows × 25 columns

Exploratory Data Analysis (EDA)¶

In [ ]:
# have a basic sense of the whole dataset
df.describe()
Out[ ]:
Month_Aug Month_Dec Month_Feb Month_Jul Month_June Month_Mar Month_May Month_Nov Month_Oct Month_Sep ... Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay Weekend Purchase
count 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 ... 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000 12330.000000
mean 0.035118 0.140065 0.014923 0.035036 0.023358 0.154663 0.272830 0.243147 0.044526 0.036334 ... 0.503569 34.472398 31.731468 1194.746220 0.022191 0.043073 5.889258 0.061427 0.232603 0.154745
std 0.184084 0.347068 0.121250 0.183880 0.151043 0.361598 0.445432 0.429000 0.206268 0.187128 ... 1.270156 140.749294 44.475503 1913.669288 0.048488 0.048597 18.568437 0.198917 0.422509 0.361676
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 7.000000 184.137500 0.000000 0.014286 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 18.000000 598.936905 0.003112 0.025156 0.000000 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 38.000000 1464.157213 0.016813 0.050000 0.000000 0.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 24.000000 2549.375000 705.000000 63973.522230 0.200000 0.200000 361.763742 1.000000 1.000000 1.000000

8 rows × 25 columns

By creating scatter plots, we aim to explore the association between the Administrative, Informational, and ProductRelated page counts and their corresponding durations, for each purchase outcome.

In [ ]:
info_adm_pdt = df[['Administrative',
       'Administrative_Duration', 'Informational', 'Informational_Duration',
       'ProductRelated', 'ProductRelated_Duration','Purchase']]
sns.pairplot(info_adm_pdt, hue="Purchase")
Out[ ]:
<seaborn.axisgrid.PairGrid at 0x7fe85b229610>
[Figure: pairplot of page counts and durations, colored by Purchase]

Examine the Distribution of Administrative, Informational and ProductRelated¶

In [ ]:
plt.hist(df['Administrative'],alpha = 0.6,label = "Administrative", bins = np.arange(0,30,5))
plt.hist(df['Informational'],alpha = 0.6,label = "Informational", bins = np.arange(0,30,5))
#plt.hist(df['ProductRelated'],alpha = 0.6,label = "ProductRelated")
plt.legend()
Out[ ]:
<matplotlib.legend.Legend at 0x7fe85b660a90>
[Figure: overlaid histograms of Administrative and Informational page counts]
In [ ]:
plt.hist(df['ProductRelated'],alpha = 0.6,label = "ProductRelated",color = 'g')
pg_count3 = df['ProductRelated'].median()
plt.axvline(x = pg_count3, color = 'r') # red line indicating the median value
plt.legend()
Out[ ]:
<matplotlib.legend.Legend at 0x7fe85edd6820>
[Figure: histogram of ProductRelated page counts, median marked by a red line]

Examine the Distribution of Administrative_Duration, Informational_Duration and ProductRelated_Duration¶

In [ ]:
plt.hist(df['Administrative_Duration'],alpha = 0.6,label = "Administrative", bins = np.arange(0,4000, 200))
plt.hist(df['Informational_Duration'],alpha = 0.6,label = "Informational", bins = np.arange(0,4000, 200))
plt.legend()
Out[ ]:
<matplotlib.legend.Legend at 0x7fe85f7b57f0>
[Figure: overlaid histograms of Administrative_Duration and Informational_Duration]
In [ ]:
plt.hist(df['ProductRelated_Duration'],alpha = 0.6,label = "ProductRelated",color = 'g')
duration_count3 = df['ProductRelated_Duration'].median()
plt.axvline(x = duration_count3, color = 'r') # red line indicating the median value
plt.legend()
Out[ ]:
<matplotlib.legend.Legend at 0x7fe85f7ad7f0>
[Figure: histogram of ProductRelated_Duration, median marked by a red line]

We see that all of the distributions plotted above share a similar right-skewed shape. From the histograms, we can see that people tend to view more product-related pages than Administrative or Informational pages.

The three duration columns follow the same pattern as the three count columns: people tend to spend more time reading ProductRelated pages and comparatively little time on Informational pages.
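To quantify this right skew, we can compute the sample skewness of each column. A quick check, assuming scipy is available in the environment:

In [ ]:
from scipy.stats import skew

# positive skewness values confirm the long right tails seen in the histograms
for col in ['Administrative', 'Informational', 'ProductRelated',
            'Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']:
    print(f"{col}: skewness = {skew(df[col]):.2f}")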

Examine the Association Between Bounce Rate and Exit Rate¶

In [ ]:
# Bounce Rate vs. Exit Rates
plt.scatter(df['BounceRates'], df['ExitRates'],alpha = 0.5)
plt.xlabel('Bounce Rates')
plt.ylabel('Exit Rates')
Out[ ]:
Text(0, 0.5, 'Exit Rates')
[Figure: scatter plot of Bounce Rates vs. Exit Rates]
In [ ]:
buy_bounce_rate = df[df['Purchase'] == 1]['BounceRates'].mean()
not_buy_bounce_rate = df[df['Purchase'] == 0]['BounceRates'].mean()
print(buy_bounce_rate,not_buy_bounce_rate)
buy_exit_rate = df[df['Purchase'] == 1]['ExitRates'].mean()
not_buy_exit_rate = df[df['Purchase'] == 0]['ExitRates'].mean()
print(buy_exit_rate,not_buy_exit_rate)
0.005117152640461212 0.025317232197850356
0.019555168256813433 0.04737827052648154

We can clearly see that buyers have a lower mean bounce rate (~0.005) than non-buyers (~0.025). Similarly, the mean exit rate for non-buyers is about 0.047, compared with about 0.020 for buyers.

Comparing the Purchase Choice between Special Day and Normal Day¶

In [ ]:
buy_special_day = df[(df['SpecialDay'] == 1) & (df['Purchase'] == 1)].shape[0]
not_buy_special_day = df[(df['SpecialDay'] == 1) & (df['Purchase'] == 0)].shape[0]
special_day_count = df[(df['SpecialDay'] == 1)].shape[0]
print(f"The buy rate on special day is {buy_special_day/special_day_count}")
print(f"The not-buy rate on special day is {not_buy_special_day/special_day_count}")
The buy rate on special day is 0.06493506493506493
The not-buy rate on special day is 0.935064935064935
In [ ]:
x1 = [(buy_special_day/special_day_count),(not_buy_special_day/special_day_count)]
plt.figure()
plt.bar(['buy','not buy'],x1, label = 'special day',color = 'r',alpha = 0.6)
plt.legend()
Out[ ]:
<matplotlib.legend.Legend at 0x7fe85e610640>
[Figure: bar chart of buy vs. not-buy rates on special days]
In [ ]:
buy_normal_day = df[(df['SpecialDay'] == 0) & (df['Purchase'] == 1)].shape[0]
not_buy_normal_day = df[(df['SpecialDay'] == 0) & (df['Purchase'] == 0)].shape[0]
normal_day_count = df[(df['SpecialDay'] == 0)].shape[0]
print(f"The buy rate on normal day is {buy_normal_day/normal_day_count}")
print(f"The not-buy rate on normal day is {not_buy_normal_day/normal_day_count}")
The buy rate on normal day is 0.16526762343171766
The not-buy rate on normal day is 0.8347323765682824
In [ ]:
x2 = [(buy_normal_day/(normal_day_count)),(not_buy_normal_day/normal_day_count)]
plt.figure()
plt.bar(['buy','not buy'],x2,label = 'normal day',color = 'b', alpha = 0.6)
plt.legend()
Out[ ]:
<matplotlib.legend.Legend at 0x7fe85eba1550>
[Figure: bar chart of buy vs. not-buy rates on normal days]

Based on the rates above, the buying rate on special days is about 0.065. Surprisingly, the buying rate appears to be higher on normal days (about 0.165) than on special days. We suspect the reason is that the sample size for special days is much smaller than for normal days.
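A quick way to check this sample-size explanation is to compare the number of sessions in each group, using the same SpecialDay == 1 and SpecialDay == 0 splits as above:

In [ ]:
# compare group sizes: sessions exactly on special days vs. ordinary days
print("special day sessions:", df[df['SpecialDay'] == 1].shape[0])
print("normal day sessions:", df[df['SpecialDay'] == 0].shape[0])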

Comparing the Purchase Choice Made in Each Month¶

In [ ]:
month_lst = ['Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
       'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep']
buying_count_over_month = [df[(df[i] == 1) & (df['Purchase'] == 1)].shape[0] for i in month_lst]
for i in range(len(buying_count_over_month)):
    print(f"The buying count in {month_lst[i]} month is {buying_count_over_month[i]}")
The buying count in Month_Aug month is 76
The buying count in Month_Dec month is 216
The buying count in Month_Feb month is 3
The buying count in Month_Jul month is 66
The buying count in Month_June month is 29
The buying count in Month_Mar month is 192
The buying count in Month_May month is 365
The buying count in Month_Nov month is 760
The buying count in Month_Oct month is 115
The buying count in Month_Sep month is 86
In [ ]:
from matplotlib.pyplot import figure
figure(figsize=(17, 6), dpi=80)
# reorder the months chronologically first, so each bar lines up with its label
month_lst = ['Month_Feb', 'Month_Mar', 'Month_May', 'Month_June', 'Month_Jul',
       'Month_Aug', 'Month_Sep', 'Month_Oct', 'Month_Nov', 'Month_Dec']
buying_count_over_month = [df[(df[i] == 1) & (df['Purchase'] == 1)].shape[0] for i in month_lst]
plt.bar(month_lst, buying_count_over_month)
Out[ ]:
<BarContainer object of 10 artists>
[Figure: bar chart of purchase counts by month]

From the counts above, the two highest numbers of purchases happened in November and May. This may be due to the sales season; Black Friday, in particular, falls in November. However, the dataset contains no online shopping data from January or April, which might introduce some bias into the following model predictions.
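We can verify which months are absent directly from the one-hot columns. A small sanity check (note the dataset's own month spellings, e.g. 'June' rather than 'Jun'):

In [ ]:
# list the months that never appear in the data
all_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
missing = [m for m in all_months if 'Month_' + m not in df.columns]
print("Months with no sessions:", missing)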

Comparing the Purchase Choice between Weekday and Weekend¶

In [ ]:
buy_weekend = df[(df['Weekend'] == 1) & (df['Purchase'] == 1)].shape[0]
not_buy_weekend = df[(df['Weekend'] == 1) & (df['Purchase'] == 0)].shape[0]
print(buy_weekend,not_buy_weekend,buy_weekend/(buy_weekend+not_buy_weekend))

buy_nonweekend = df[(df['Weekend'] == 0) & (df['Purchase'] == 1)].shape[0]
not_buy_nonweekend = df[(df['Weekend'] == 0) & (df['Purchase'] == 0)].shape[0]
print(buy_nonweekend,not_buy_nonweekend,buy_nonweekend/(buy_nonweekend+not_buy_nonweekend))
499 2369 0.17398884239888424
1409 8053 0.1489114352145424
In [ ]:
sns.histplot(df, x = 'Weekend', hue = 'Purchase', binrange = (0,2), binwidth=1)
Out[ ]:
<AxesSubplot:xlabel='Weekend', ylabel='Count'>
[Figure: histogram of weekend vs. weekday sessions, split by Purchase]

Most weekend sessions do not end in a purchase. However, the buying rate on weekends (about 17.4%) is still slightly higher than on weekdays (about 14.9%).
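An equivalent, more compact way to obtain these rates is a groupby on the Weekend flag; the mean of the binary Purchase column within each group is exactly the purchase rate computed above:

In [ ]:
# purchase rate by weekend flag (0 = weekday, 1 = weekend)
print(df.groupby('Weekend')['Purchase'].mean())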

The Distribution of Visitor Types¶

In [ ]:
new_buy = df[(df['VisitorType_New_Visitor'] == 1) & (df['Purchase'] == 1)].shape[0]
new_ = df[(df['VisitorType_New_Visitor'] == 1)].shape[0]

return_buy = df[(df['VisitorType_Returning_Visitor'] == 1) & (df['Purchase'] == 1)].shape[0]
return_  = df[(df['VisitorType_Returning_Visitor'] == 1)].shape[0]

print(f"The percentage of new users buying the product is {new_buy/new_}")
print(f"The percentage of returning users buying the product is {return_buy/return_}")
The percentage of new users buying the product is 0.24911452184179456
The percentage of returning users buying the product is 0.1393232868922377
In [ ]:
plt.rcParams['figure.figsize'] = (16, 5)
counter = [df[df['VisitorType_New_Visitor'] == 1].shape[0], 
           df[df['VisitorType_Returning_Visitor'] == 1].shape[0], 
           df[df['VisitorType_Other']==1].shape[0]]
plt.figure()
plt.pie(counter,autopct = '%.2f%%', labels = ['New Visitor','Returning Visitor','Other'])
plt.title('Visitor Type Pie Chart')
plt.legend()
Out[ ]:
<matplotlib.legend.Legend at 0x7fe85fac7640>
[Figure: pie chart of visitor types]

We can see that most visitors are returning visitors.

Proposed Solution¶

For this project, we aim to predict whether a consumer will make a purchase, based on a dataset featuring multiple aspects of online shopping behavior. We will employ four different machine learning models: logistic regression, K-NN, decision trees, and SVMs (other suitable models may be employed after exploration). We deliberately chose these four because the dataset contains numeric features and a binary output variable, so classification models such as K-NN, decision trees, and SVMs are natural candidates.

In addition, logistic regression is a statistical method borrowed by machine learning for cases where the dependent variable is dichotomous or binary, so given the binary nature of our output it is a viable choice as well. The main coding component will involve sklearn: LogisticRegression from sklearn.linear_model, KNeighborsClassifier from sklearn.neighbors, DecisionTreeClassifier from sklearn.tree, and SVC from sklearn.svm. We will first use k-fold or stratified k-fold cross-validation to select the hyperparameters that give the best performance for each of the models above, then train each model on the entire training set and run it on our testing set to compare performance. For performance, we will focus on accuracy and the ROC curve, visualized by drawing the confusion matrix and the ROC curve.
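Because only about 15% of sessions end in a purchase, stratified k-fold is preferable to plain k-fold here: it keeps the class proportions roughly constant across folds. A minimal sketch of that behavior on our data (illustrative only; the actual grid searches below pass StratifiedKFold directly to GridSearchCV):

In [ ]:
from sklearn.model_selection import StratifiedKFold

# each validation fold should show roughly the same ~15% purchase rate as the full data
labels = df['Purchase']
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_idx, val_idx) in enumerate(skf.split(df.drop(columns='Purchase'), labels)):
    print(f"fold {fold}: validation purchase rate = {labels.iloc[val_idx].mean():.3f}")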

Evaluation Metrics¶

We will apply the confusion matrix, which gives us a matrix as output and describes the complete performance of the model. In our confusion matrix, every row is the ground truth and every column is the prediction. From it we can calculate recall and precision to evaluate the relationship between the true purchase labels and the predicted ones: recall measures how many of the true purchases in the dataset are predicted correctly, while precision measures how often a predicted purchase is correct.
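As a small illustration of these definitions on toy labels (not our dataset; 1 = purchase, 0 = no purchase):

In [ ]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

y_true_toy = [1, 0, 1, 1, 0, 0, 1, 0]
y_pred_toy = [1, 0, 0, 1, 0, 1, 1, 0]
# for binary labels, ravel() returns the counts in the order tn, fp, fn, tp
tn, fp, fn, tp = confusion_matrix(y_true_toy, y_pred_toy).ravel()
print("precision =", tp / (tp + fp))  # 3 / (3 + 1) = 0.75
print("recall    =", tp / (tp + fn))  # 3 / (3 + 1) = 0.75
# sklearn's helpers give the same numbers
print(precision_score(y_true_toy, y_pred_toy), recall_score(y_true_toy, y_pred_toy))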

In addition, an ROC curve (receiver operating characteristic curve) is a graph showing the performance of a classification model at all classification thresholds. The curve plots two parameters: the True Positive Rate (TPR), which is a synonym for recall, and the False Positive Rate (FPR), the fraction of negative samples incorrectly classified as positive, i.e. FP / (FP + TN). An ROC curve plots TPR vs. FPR at different classification thresholds; lowering the threshold classifies more items as positive, thus increasing both false positives and true positives.
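To make the threshold idea concrete, here is a small hand-rolled sweep over thresholds on toy scores (illustrative only; sklearn's roc_curve performs this sweep over every threshold automatically):

In [ ]:
y_true_toy = np.array([0, 0, 1, 1, 0, 1])
scores_toy = np.array([0.1, 0.4, 0.35, 0.8, 0.6, 0.7])  # toy predicted probabilities
for thresh in [0.3, 0.5, 0.7]:
    y_hat = (scores_toy >= thresh).astype(int)
    tp = ((y_hat == 1) & (y_true_toy == 1)).sum()
    fp = ((y_hat == 1) & (y_true_toy == 0)).sum()
    tpr = tp / (y_true_toy == 1).sum()  # recall
    fpr = fp / (y_true_toy == 0).sum()
    print(f"threshold={thresh}: TPR={tpr:.2f}, FPR={fpr:.2f}")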

Moreover, classification accuracy will be the major evaluation metric in this project, which is the ratio of the number of correct predictions to the total number of input samples.

  • For example, we will use sklearn's model selection utilities to split the data into training and testing sets of X and y, where X holds the features on customer behavior and dates and y records whether the customer purchased. After generating predictions with our models, we will print the classification report from the sklearn.metrics package to compare the predictions against the test y.

Train-Test Split¶

We first split our dataset into an 80% training set and a 20% testing set.

In [ ]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import eli5
from eli5.sklearn import PermutationImportance
In [ ]:
X = df.drop('Purchase', axis = 1)
y = df['Purchase']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Logistic Regression¶

We first train a simple Logistic Regression on our training set, and then print out its score, Confusion Matrix, and ROC curve.

In [ ]:
# select Logistic Regression as our model
clf = LogisticRegression(random_state=0, max_iter = 50000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# get the accuracy for our model
score = clf.score(X_test, y_test)
print("The accuracy we got in our testing set is ", score)
The accuracy we got in our testing set is  0.889294403892944
In [ ]:
print(classification_report(y_test, y_pred))

# get and plot the confusion matrix for our model
cm1 = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(cm1, cmap=plt.cm.Reds, alpha=0.3)
for i in range(cm1.shape[0]):
    for j in range(cm1.shape[1]):
        ax.text(x=j, y=i,s=cm1[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2115
           1       0.70      0.38      0.50       351

    accuracy                           0.89      2466
   macro avg       0.80      0.68      0.72      2466
weighted avg       0.88      0.89      0.88      2466

[Figure: confusion matrix for the baseline logistic regression]
In [ ]:
fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:,1])  # false and true positive rates
roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

# get and plot the ROC curve for our model
plt.figure(dpi=100)
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

print('Area under the Receiver Operating Characteristic curve:', 
      roc_auc)
Area under the Receiver Operating Characteristic curve: 0.9080236810733264
[Figure: ROC curve for the baseline logistic regression]
In [ ]:
# finding the Permutation importance

perm = PermutationImportance(clf).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())
Out[ ]:
Weight Feature
0.0760 ± 0.0075 PageValues
0.0024 ± 0.0021 VisitorType_Returning_Visitor
0.0009 ± 0.0024 Month_Nov
0.0009 ± 0.0012 Month_Dec
0.0006 ± 0.0008 ProductRelated_Duration
0.0006 ± 0.0008 Informational
0.0003 ± 0.0003 SpecialDay
0.0003 ± 0.0006 Month_Oct
0.0002 ± 0.0004 Administrative_Duration
0.0002 ± 0.0012 Month_Mar
0.0001 ± 0.0003 Month_June
0.0001 ± 0.0003 Month_Aug
0 ± 0.0000 VisitorType_Other
0 ± 0.0000 BounceRates
0 ± 0.0000 Informational_Duration
0 ± 0.0000 ExitRates
0.0000 ± 0.0005 Month_Feb
0 ± 0.0000 Month_Sep
-0.0001 ± 0.0003 Month_Jul
-0.0004 ± 0.0030 Month_May
… 4 more …

From this application of 'Permutation Importance' to our logistic regression model, 'PageValues' and 'VisitorType_Returning_Visitor' appear as the main features of importance when predicting who will or will not make a purchase. This indicates that sessions involving higher-value pages have a larger chance of ending in a purchase, and that whether a visitor is returning also carries predictive signal.
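Under the hood, permutation importance simply shuffles one feature at a time and measures the resulting drop in test score. A hand-rolled sketch of the same idea (eli5's implementation differs in details, e.g. it repeats the shuffle several times and averages):

In [ ]:
rng = np.random.default_rng(0)
baseline = clf.score(X_test, y_test)

# shuffle one column at a time and record the drop in test accuracy
for col in ['PageValues', 'VisitorType_Returning_Visitor', 'BounceRates']:
    X_perm = X_test.copy()
    X_perm[col] = rng.permutation(X_perm[col].values)
    print(f"{col}: importance ≈ {baseline - clf.score(X_perm, y_test):.4f}")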

However, we could get a better model by selecting better hyperparameters. So we will do a grid search over different hyperparameters, select the best model from the grid, train it on the entire training set, and evaluate it on the test set.

In [ ]:
# create a pipeline 
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# create search space of candidate learning algorithms and their hyperparameters
search_space = [{'classifier': [LogisticRegression(max_iter=1000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=1000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=1000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']}
                ]


# create grid search 
clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), n_jobs = -1,
                   scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False,
                   verbose=0)

# fit grid search
best_model = clf.fit(X_train, y_train)

Take a look at our grid search result

In [ ]:
best_model.cv_results_
Out[ ]:
{'mean_fit_time': array([0.02849622, 0.07590842, 0.08350377, 0.06417475, 0.12940178,
        0.14366174, 0.55658669, 0.38879089, 0.72470627, 0.50435376,
        0.64737802, 0.5357574 , 0.69090986, 0.54721751, 0.70645866,
        0.54729466, 0.6764832 , 0.53953753, 1.42450256, 0.02621069,
        0.0372375 , 0.04630599, 0.0576108 , 0.04791732, 0.05091891,
        0.04991126, 0.04822764, 0.0458199 , 0.40240765]),
 'std_fit_time': array([0.00361964, 0.00530308, 0.00471964, 0.00142439, 0.00843137,
        0.0181239 , 0.06751444, 0.08809696, 0.23596016, 0.14233427,
        0.15380096, 0.15792707, 0.19492574, 0.14347641, 0.20632176,
        0.15277735, 0.15639909, 0.13313352, 0.69776842, 0.0011663 ,
        0.00123044, 0.00415285, 0.00537088, 0.00257151, 0.00546733,
        0.0049127 , 0.00303712, 0.00813633, 0.15317393]),
 'mean_score_time': array([0.01127534, 0.01253233, 0.01231728, 0.01136942, 0.01183848,
        0.00989614, 0.01006427, 0.01021094, 0.01009674, 0.00980864,
        0.00998559, 0.0099493 , 0.00987554, 0.01027188, 0.00974879,
        0.01006694, 0.00965524, 0.00993476, 0.01066861, 0.01139941,
        0.01082945, 0.01197596, 0.01182032, 0.01199479, 0.01152382,
        0.0117857 , 0.01210637, 0.01333613, 0.00734215]),
 'std_score_time': array([0.0007505 , 0.00194321, 0.00298983, 0.00056215, 0.00284494,
        0.00028162, 0.00037799, 0.0011404 , 0.00069172, 0.00032584,
        0.00042727, 0.00036324, 0.00059416, 0.0003933 , 0.00017076,
        0.00095975, 0.00028477, 0.00027073, 0.00104854, 0.00102579,
        0.00068365, 0.00070992, 0.00049337, 0.00049866, 0.00024777,
        0.00110073, 0.00216705, 0.00333745, 0.00110959]),
 'param_classifier': masked_array(data=[LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000),
                    LogisticRegression(max_iter=1000)],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__C': masked_array(data=[0.0001, 0.0001, 0.001, 0.001, 0.01, 0.01, 0.1, 0.1,
                    1.0, 1.0, 10.0, 10.0, 100.0, 100.0, 1000.0, 1000.0,
                    10000.0, 10000.0, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0,
                    100.0, 1000.0, 10000.0, --, --],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False,  True,  True],
        fill_value='?',
             dtype=object),
 'param_classifier__penalty': masked_array(data=['l1', 'l2', 'l1', 'l2', 'l1', 'l2', 'l1', 'l2', 'l1',
                    'l2', 'l1', 'l2', 'l1', 'l2', 'l1', 'l2', 'l1', 'l2',
                    'l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l2', 'l2',
                    'none', 'none'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__solver': masked_array(data=['saga', 'saga', 'saga', 'saga', 'saga', 'saga', 'saga',
                    'saga', 'saga', 'saga', 'saga', 'saga', 'saga', 'saga',
                    'saga', 'saga', 'saga', 'saga', 'lbfgs', 'lbfgs',
                    'lbfgs', 'lbfgs', 'lbfgs', 'lbfgs', 'lbfgs', 'lbfgs',
                    'lbfgs', 'lbfgs', 'saga'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False, False, False, False,
                    False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.0001,
   'classifier__penalty': 'l1',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.0001,
   'classifier__penalty': 'l2',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.001,
   'classifier__penalty': 'l1',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.001,
   'classifier__penalty': 'l2',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.01,
   'classifier__penalty': 'l1',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.01,
   'classifier__penalty': 'l2',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.1,
   'classifier__penalty': 'l1',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.1,
   'classifier__penalty': 'l2',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 1.0,
   'classifier__penalty': 'l1',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 1.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 10.0,
   'classifier__penalty': 'l1',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 10.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 100.0,
   'classifier__penalty': 'l1',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 100.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 1000.0,
   'classifier__penalty': 'l1',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 1000.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 10000.0,
   'classifier__penalty': 'l1',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 10000.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'saga'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.0001,
   'classifier__penalty': 'l2',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.001,
   'classifier__penalty': 'l2',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.01,
   'classifier__penalty': 'l2',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 0.1,
   'classifier__penalty': 'l2',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 1.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 10.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 100.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 1000.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__C': 10000.0,
   'classifier__penalty': 'l2',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__penalty': 'none',
   'classifier__solver': 'lbfgs'},
  {'classifier': LogisticRegression(max_iter=1000),
   'classifier__penalty': 'none',
   'classifier__solver': 'saga'}],
 'split0_test_accuracy': array([0.84237202, 0.84135834, 0.84490623, 0.86112519, 0.8803852 ,
        0.88089204, 0.88494678, 0.88443994, 0.88494678, 0.88494678,
        0.88494678, 0.88494678, 0.88494678, 0.88494678, 0.88494678,
        0.88494678, 0.88494678, 0.88494678, 0.84135834, 0.86112519,
        0.88089204, 0.88443994, 0.88494678, 0.88494678, 0.88494678,
        0.88494678, 0.88494678, 0.88494678, 0.88494678]),
 'split1_test_accuracy': array([0.84237202, 0.84338571, 0.84591992, 0.8504815 , 0.87328941,
        0.87278256, 0.87633046, 0.8768373 , 0.87734415, 0.87785099,
        0.87785099, 0.87785099, 0.87785099, 0.87785099, 0.87785099,
        0.87785099, 0.87785099, 0.87785099, 0.84338571, 0.85098834,
        0.87278256, 0.8768373 , 0.87785099, 0.87785099, 0.87785099,
        0.87785099, 0.87785099, 0.87785099, 0.87785099]),
 'split2_test_accuracy': array([0.84186518, 0.84237202, 0.84591992, 0.86822098, 0.87987836,
        0.88494678, 0.88900152, 0.88849468, 0.88798784, 0.88798784,
        0.88798784, 0.88798784, 0.88798784, 0.88798784, 0.88798784,
        0.88798784, 0.88798784, 0.88798784, 0.84237202, 0.86822098,
        0.88494678, 0.88849468, 0.88798784, 0.88798784, 0.88798784,
        0.88798784, 0.88798784, 0.88798784, 0.88798784]),
 'split3_test_accuracy': array([0.84186518, 0.84135834, 0.84642676, 0.86213887, 0.87987836,
        0.87937152, 0.88190573, 0.88139888, 0.88190573, 0.88139888,
        0.88139888, 0.88139888, 0.88139888, 0.88139888, 0.88139888,
        0.88139888, 0.88139888, 0.88139888, 0.84135834, 0.86213887,
        0.87937152, 0.88139888, 0.88139888, 0.88139888, 0.88139888,
        0.88139888, 0.88139888, 0.88139888, 0.88139888]),
 'split4_test_accuracy': array([0.84229209, 0.84279919, 0.84634888, 0.86359026, 0.87880325,
        0.87778905, 0.88184584, 0.88133874, 0.88336714, 0.88235294,
        0.88336714, 0.88286004, 0.88286004, 0.88286004, 0.88286004,
        0.88286004, 0.88286004, 0.88286004, 0.84279919, 0.86359026,
        0.87778905, 0.88133874, 0.88235294, 0.88336714, 0.88336714,
        0.88336714, 0.88336714, 0.88336714, 0.88286004]),
 'mean_test_accuracy': array([0.8421533 , 0.84225472, 0.84590434, 0.86111136, 0.87844691,
        0.87915639, 0.88280607, 0.88250191, 0.88311033, 0.88290749,
        0.88311033, 0.88300891, 0.88300891, 0.88300891, 0.88300891,
        0.88300891, 0.88300891, 0.88300891, 0.84225472, 0.86121273,
        0.87915639, 0.88250191, 0.88290749, 0.88311033, 0.88311033,
        0.88311033, 0.88311033, 0.88311033, 0.88300891]),
 'std_test_accuracy': array([0.00023705, 0.00079954, 0.00054171, 0.00584366, 0.0026299 ,
        0.003977  , 0.00416251, 0.00385556, 0.00351933, 0.00340941,
        0.00340054, 0.00339893, 0.00339893, 0.00339893, 0.00339893,
        0.00339893, 0.00339893, 0.00339893, 0.00079954, 0.00565989,
        0.003977  , 0.00385556, 0.00340941, 0.00340054, 0.00340054,
        0.00340054, 0.00340054, 0.00340054, 0.00339893]),
 'rank_test_accuracy': array([29, 27, 26, 25, 23, 21, 18, 19,  1, 16,  2,  8,  8,  8,  8,  8,  8,
         8, 27, 24, 21, 19, 16,  2,  2,  2,  2,  2,  8], dtype=int32),
 'split0_test_roc_auc_ovr': array([0.5       , 0.86433267, 0.85361746, 0.88670141, 0.89504374,
        0.89715448, 0.89539392, 0.89655086, 0.89543261, 0.89564349,
        0.89548098, 0.8955042 , 0.89549646, 0.89549646, 0.89548872,
        0.89549259, 0.89548872, 0.89549259, 0.86433074, 0.88669561,
        0.89714093, 0.89651603, 0.89556998, 0.89543455, 0.8954094 ,
        0.8954094 , 0.8954094 , 0.8954094 , 0.89548872]),
 'split1_test_roc_auc_ovr': array([0.5       , 0.85523098, 0.87450327, 0.87760166, 0.89933099,
        0.88952314, 0.89254124, 0.88946123, 0.88931613, 0.88903947,
        0.88897369, 0.88897176, 0.88896982, 0.88894661, 0.88895435,
        0.88896402, 0.8889408 , 0.88894274, 0.85522905, 0.87768291,
        0.8895154 , 0.88942834, 0.88902593, 0.88893693, 0.88893306,
        0.88893113, 0.88893113, 0.88893113, 0.88895048]),
 'split2_test_roc_auc_ovr': array([0.5       , 0.86442559, 0.86840064, 0.88605875, 0.89948324,
        0.89600604, 0.89713101, 0.89590762, 0.8956394 , 0.89551591,
        0.89547924, 0.89543293, 0.89543872, 0.89543486, 0.89543486,
        0.89543679, 0.89543679, 0.89543486, 0.86441401, 0.88606068,
        0.89600797, 0.89590184, 0.8955024 , 0.89542907, 0.89541171,
        0.89540592, 0.89540399, 0.89540399, 0.89543486]),
 'split3_test_roc_auc_ovr': array([0.5       , 0.83925539, 0.85631049, 0.86625102, 0.88893295,
        0.88252752, 0.88624689, 0.88466556, 0.88515375, 0.88489132,
        0.88495886, 0.88493377, 0.88493956, 0.8849357 , 0.8849357 ,
        0.8849357 , 0.8849357 , 0.8849357 , 0.83925539, 0.86624909,
        0.88253524, 0.8846617 , 0.88489132, 0.88491834, 0.88491641,
        0.88491641, 0.88491641, 0.88491641, 0.8849357 ]),
 'split4_test_roc_auc_ovr': array([0.5       , 0.86831045, 0.8616347 , 0.88697778, 0.90883344,
        0.89575683, 0.89799079, 0.89614012, 0.89586717, 0.89565423,
        0.8956039 , 0.89559615, 0.89559422, 0.89559228, 0.89559422,
        0.89559422, 0.89559422, 0.89559422, 0.86831433, 0.88697972,
        0.89577231, 0.89610721, 0.89562519, 0.89555937, 0.89555356,
        0.89555163, 0.89555356, 0.89555356, 0.89559422]),
 'mean_test_roc_auc_ovr': array([0.5       , 0.85831102, 0.86289331, 0.88071813, 0.89832487,
        0.8921936 , 0.89386077, 0.89254508, 0.89228181, 0.89214888,
        0.89209933, 0.89208776, 0.89208776, 0.89208118, 0.89208157,
        0.89208466, 0.89207925, 0.89208002, 0.8583087 , 0.8807336 ,
        0.89219437, 0.89252302, 0.89212296, 0.89205565, 0.89204483,
        0.8920429 , 0.8920429 , 0.8920429 , 0.8920808 ]),
 'std_test_roc_auc_ovr': array([0.        , 0.01045218, 0.0076952 , 0.00803137, 0.00650781,
        0.00552098, 0.00423826, 0.00473021, 0.00432805, 0.00443119,
        0.00437944, 0.00438315, 0.00438091, 0.00438458, 0.00438258,
        0.00438209, 0.00438481, 0.00438484, 0.01045146, 0.00802552,
        0.00551887, 0.00472408, 0.00441489, 0.00437586, 0.00436955,
        0.00436862, 0.00436864, 0.00436864, 0.00438313]),
 'rank_test_roc_auc_ovr': array([29, 27, 26, 25,  1,  7,  2,  3,  5,  8, 10, 11, 12, 15, 14, 13, 18,
        17, 28, 24,  6,  4,  9, 19, 20, 23, 21, 21, 16], dtype=int32),
 'split0_test_f1_micro': array([0.84237202, 0.84135834, 0.84490623, 0.86112519, 0.8803852 ,
        0.88089204, 0.88494678, 0.88443994, 0.88494678, 0.88494678,
        0.88494678, 0.88494678, 0.88494678, 0.88494678, 0.88494678,
        0.88494678, 0.88494678, 0.88494678, 0.84135834, 0.86112519,
        0.88089204, 0.88443994, 0.88494678, 0.88494678, 0.88494678,
        0.88494678, 0.88494678, 0.88494678, 0.88494678]),
 'split1_test_f1_micro': array([0.84237202, 0.84338571, 0.84591992, 0.8504815 , 0.87328941,
        0.87278256, 0.87633046, 0.8768373 , 0.87734415, 0.87785099,
        0.87785099, 0.87785099, 0.87785099, 0.87785099, 0.87785099,
        0.87785099, 0.87785099, 0.87785099, 0.84338571, 0.85098834,
        0.87278256, 0.8768373 , 0.87785099, 0.87785099, 0.87785099,
        0.87785099, 0.87785099, 0.87785099, 0.87785099]),
 'split2_test_f1_micro': array([0.84186518, 0.84237202, 0.84591992, 0.86822098, 0.87987836,
        0.88494678, 0.88900152, 0.88849468, 0.88798784, 0.88798784,
        0.88798784, 0.88798784, 0.88798784, 0.88798784, 0.88798784,
        0.88798784, 0.88798784, 0.88798784, 0.84237202, 0.86822098,
        0.88494678, 0.88849468, 0.88798784, 0.88798784, 0.88798784,
        0.88798784, 0.88798784, 0.88798784, 0.88798784]),
 'split3_test_f1_micro': array([0.84186518, 0.84135834, 0.84642676, 0.86213887, 0.87987836,
        0.87937152, 0.88190573, 0.88139888, 0.88190573, 0.88139888,
        0.88139888, 0.88139888, 0.88139888, 0.88139888, 0.88139888,
        0.88139888, 0.88139888, 0.88139888, 0.84135834, 0.86213887,
        0.87937152, 0.88139888, 0.88139888, 0.88139888, 0.88139888,
        0.88139888, 0.88139888, 0.88139888, 0.88139888]),
 'split4_test_f1_micro': array([0.84229209, 0.84279919, 0.84634888, 0.86359026, 0.87880325,
        0.87778905, 0.88184584, 0.88133874, 0.88336714, 0.88235294,
        0.88336714, 0.88286004, 0.88286004, 0.88286004, 0.88286004,
        0.88286004, 0.88286004, 0.88286004, 0.84279919, 0.86359026,
        0.87778905, 0.88133874, 0.88235294, 0.88336714, 0.88336714,
        0.88336714, 0.88336714, 0.88336714, 0.88286004]),
 'mean_test_f1_micro': array([0.8421533 , 0.84225472, 0.84590434, 0.86111136, 0.87844691,
        0.87915639, 0.88280607, 0.88250191, 0.88311033, 0.88290749,
        0.88311033, 0.88300891, 0.88300891, 0.88300891, 0.88300891,
        0.88300891, 0.88300891, 0.88300891, 0.84225472, 0.86121273,
        0.87915639, 0.88250191, 0.88290749, 0.88311033, 0.88311033,
        0.88311033, 0.88311033, 0.88311033, 0.88300891]),
 'std_test_f1_micro': array([0.00023705, 0.00079954, 0.00054171, 0.00584366, 0.0026299 ,
        0.003977  , 0.00416251, 0.00385556, 0.00351933, 0.00340941,
        0.00340054, 0.00339893, 0.00339893, 0.00339893, 0.00339893,
        0.00339893, 0.00339893, 0.00339893, 0.00079954, 0.00565989,
        0.003977  , 0.00385556, 0.00340941, 0.00340054, 0.00340054,
        0.00340054, 0.00340054, 0.00340054, 0.00339893]),
 'rank_test_f1_micro': array([29, 27, 26, 25, 23, 21, 18, 19,  1, 16,  2,  8,  8,  8,  8,  8,  8,
         8, 27, 24, 21, 19, 16,  2,  2,  2,  2,  2,  8], dtype=int32)}
In [ ]:
# get the model with highest accuracy from grid search
p_accu = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_accuracy'])]
p_accu
Out[ ]:
{'classifier': LogisticRegression(max_iter=1000),
 'classifier__C': 1.0,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}
In [ ]:
# set the selected parameter to the pipeline
pipe.set_params(**p_accu)
Out[ ]:
Pipeline(steps=[('std', StandardScaler()),
                ('classifier',
                 LogisticRegression(max_iter=1000, penalty='l1',
                                    solver='saga'))])
In [ ]:
# train on the entire training set with the model with highest accuracy from grid search
clf = pipe.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score1 = clf.score(X_test, y_test)
print("The accuracy score for this model is", score1)
The accuracy score for this model is 0.8909164639091647
In [ ]:
print(classification_report(y_test, y_pred))

cm2 = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(cm2, cmap=plt.cm.Reds, alpha=0.3)
for i in range(cm2.shape[0]):
    for j in range(cm2.shape[1]):
        ax.text(x=j, y=i,s=cm2[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2115
           1       0.71      0.39      0.50       351

    accuracy                           0.89      2466
   macro avg       0.81      0.68      0.72      2466
weighted avg       0.88      0.89      0.88      2466

[Figure: confusion matrix for the accuracy-selected logistic regression]
In [ ]:
fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:,1])  # false and true positive rates
roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

# plotting the ROC curve
plt.figure(dpi=100)
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

print('Area under the Receiver Operating Characteristic curve:', 
      roc_auc)
Area under the Receiver Operating Characteristic curve: 0.9066874111791369
[Figure: ROC curve for the accuracy-selected logistic regression]
In [ ]:
# get the model with largest Area Under the Receiver Operating Characteristic Curve from grid search
p_rao = best_model.cv_results_['params'][np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr'])]
p_rao
Out[ ]:
{'classifier': LogisticRegression(max_iter=1000, penalty='l1', solver='saga'),
 'classifier__C': 0.01,
 'classifier__penalty': 'l1',
 'classifier__solver': 'saga'}
In [ ]:
# set the selected parameter to the pipeline
pipe.set_params(**p_rao)
clf = pipe.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = clf.score(X_test, y_test)
print("The accuracy score for this model is", score)
The accuracy score for this model is 0.8880778588807786
In [ ]:
print(classification_report(y_test, y_pred))

cm3 = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(cm3, cmap=plt.cm.Reds, alpha=0.3)
for i in range(cm3.shape[0]):
    for j in range(cm3.shape[1]):
        ax.text(x=j, y=i,s=cm3[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      2115
           1       0.74      0.33      0.46       351

    accuracy                           0.89      2466
   macro avg       0.82      0.66      0.70      2466
weighted avg       0.88      0.89      0.87      2466

[Figure: confusion matrix for the AUC-selected logistic regression]
In [ ]:
fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test)[:,1])  # false and true positive rates
roc_auc1 = roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

# plotting the ROC curve
plt.figure(dpi=100)
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

print('Area under the Receiver Operating Characteristic curve:', roc_auc1)
Area under the Receiver Operating Characteristic curve: 0.9098516228539869
[Figure: ROC curve for the AUC-selected logistic regression]

Now that we have the logistic regression models with the highest accuracy and the largest area under the Receiver Operating Characteristic curve, we will compare these two models with other models in what follows.

K-Nearest Neighbors Classification (KNN)¶

We train a KNN model on our dataset; it classifies a point by computing the similarity between the point we want to predict and every data point in the training set, then voting among the nearest neighbors. We then print out its Accuracy, Confusion Matrix, and ROC curve to evaluate its performance.
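Conceptually, KNN classifies a query point by finding the k closest training points and taking a majority vote. A toy sketch of that prediction rule (sklearn's implementation below uses optimized data structures, but the idea is the same):

In [ ]:
def knn_predict_one(x, X_train_arr, y_train_arr, k=7):
    """Toy nearest-neighbor vote for a single query point."""
    dists = np.linalg.norm(X_train_arr - x, axis=1)  # Euclidean distance to every training point
    nearest = np.argsort(dists)[:k]                  # indices of the k closest points
    return int(y_train_arr[nearest].mean() >= 0.5)   # majority vote (k odd, so no ties)

# example: classify the first test point
print(knn_predict_one(X_test.values[0], X_train.values, y_train.values, k=7))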

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import VarianceThreshold
In [ ]:
# introduce the knn model
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# print out original accuracy of KNN
print("The accuracy score for this model is:", metrics.accuracy_score(y_test, y_pred))
The accuracy score for this model is: 0.8742903487429035
In [ ]:
# print out stats and visualization of confusion matrix
print(classification_report(y_test, y_pred))

cm4 = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(cm4, cmap=plt.cm.Blues, alpha=0.3)
for i in range(cm4.shape[0]):
    for j in range(cm4.shape[1]):
        ax.text(x=j, y=i,s=cm4[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      2115
           1       0.62      0.29      0.40       351

    accuracy                           0.87      2466
   macro avg       0.76      0.63      0.66      2466
weighted avg       0.85      0.87      0.85      2466

[Figure: confusion matrix]
In [ ]:
proba = knn.predict_proba(X_test)[:, 1]  # predicted probability of a purchase
fpr, tpr, _ = roc_curve(y_test, proba)   # false and true positive rates
roc_auc = roc_auc_score(y_test, proba)

# plotting the ROC curve
plt.figure(dpi=100)
plt.plot(fpr, tpr)
plt.title('ROC curve') 
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate')
print('Area under the Receiver Operating Characteristic curve:', roc_auc)
Area under the Receiver Operating Characteristic curve: 0.7985344136644373
[Figure: ROC curve]
In [ ]:
# finding the permutation importance (imports included here in case this cell runs first)
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(knn).fit(X_test, y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())
Out[ ]:
Weight Feature
0.0651 ± 0.0045 PageValues
0.0073 ± 0.0046 ProductRelated_Duration
0.0040 ± 0.0076 Administrative_Duration
0.0018 ± 0.0042 ProductRelated
0.0003 ± 0.0037 Informational_Duration
0.0002 ± 0.0004 Administrative
0 ± 0.0000 Month_Jul
0 ± 0.0000 Month_June
0 ± 0.0000 Month_Mar
0 ± 0.0000 BounceRates
0 ± 0.0000 Month_May
0 ± 0.0000 Month_Nov
0 ± 0.0000 Month_Oct
0 ± 0.0000 Month_Sep
0 ± 0.0000 Month_Aug
0 ± 0.0000 Month_Dec
0 ± 0.0000 VisitorType_Other
0 ± 0.0000 SpecialDay
0 ± 0.0000 Month_Feb
0 ± 0.0000 Weekend
… 4 more …

From this application of permutation importance to our KNN model, 'PageValues' and 'ProductRelated_Duration' appear to be the main features of importance in determining who will or will not make a purchase. This indicates that the more often and the longer product pages are viewed, the more likely the customer is to make a purchase.
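
For intuition, the idea behind permutation importance can be sketched by hand: shuffle one column at a time and measure how much the test accuracy drops. This is a simplified version of what eli5's PermutationImportance computes, assuming X_test is a pandas DataFrame:

import numpy as np

def permutation_importance_sketch(model, X_test, y_test, n_repeats=5):
    base = model.score(X_test, y_test)  # accuracy with intact features
    rng = np.random.default_rng(0)
    drops = {}
    for col in X_test.columns:
        scores = []
        for _ in range(n_repeats):
            X_perm = X_test.copy()
            # shuffle a single feature, breaking its link to the label
            X_perm[col] = rng.permutation(X_perm[col].values)
            scores.append(model.score(X_perm, y_test))
        # importance = accuracy lost when this feature is scrambled
        drops[col] = base - np.mean(scores)
    return drops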

However, we might obtain a better model by selecting better hyperparameters, so we will run a grid search over different hyperparameter values, select the best model from the grid, train it on the entire training set, and evaluate it on the test set.

In [ ]:
# create a pipeline 
pipe = Pipeline([
('scaler', StandardScaler()),
('selector', VarianceThreshold()),
('classifier', KNeighborsClassifier())
])

pipe.fit(X_train, y_train)
print('Training set score: ' + str(pipe.score(X_train,y_train)))
print('Test set score: ' + str(pipe.score(X_test,y_test)))
Training set score: 0.9073398215733982
Test set score: 0.8896999188969992
In [ ]:
# setting up parameters - number of neighbors
parameters = {
 'classifier__n_neighbors': [1, 3, 5, 7, 10]
}
# create grid search
grid = GridSearchCV(pipe, parameters,scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'],
                    cv=5, n_jobs = -1, refit=False,verbose=0).fit(X_train, y_train)
In [ ]:
# fit grid search
best_model = grid.fit(X_train, y_train)

Take a look at our grid search results.

In [ ]:
best_model.cv_results_
Out[ ]:
{'mean_fit_time': array([0.07479305, 0.06789064, 0.0694169 , 0.07223959, 0.06771693]),
 'std_fit_time': array([0.00614116, 0.00316065, 0.0044747 , 0.0065796 , 0.00418125]),
 'mean_score_time': array([0.99627099, 1.18802872, 1.33055763, 1.4577014 , 1.40610828]),
 'std_score_time': array([0.02603629, 0.02281447, 0.03071443, 0.030865  , 0.26218933]),
 'param_classifier__n_neighbors': masked_array(data=[1, 3, 5, 7, 10],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier__n_neighbors': 1},
  {'classifier__n_neighbors': 3},
  {'classifier__n_neighbors': 5},
  {'classifier__n_neighbors': 7},
  {'classifier__n_neighbors': 10}],
 'split0_test_accuracy': array([0.84946782, 0.87176888, 0.87582362, 0.8768373 , 0.87633046]),
 'split1_test_accuracy': array([0.85352255, 0.86416624, 0.86568677, 0.86771414, 0.87328941]),
 'split2_test_accuracy': array([0.85605677, 0.86872783, 0.8803852 , 0.88139888, 0.88089204]),
 'split3_test_accuracy': array([0.85656361, 0.87633046, 0.88545362, 0.88443994, 0.88190573]),
 'split4_test_accuracy': array([0.84888438, 0.87068966, 0.87931034, 0.87880325, 0.87829615]),
 'mean_test_accuracy': array([0.85289903, 0.87033661, 0.87733191, 0.8778387 , 0.87814276]),
 'std_test_accuracy': array([0.00321496, 0.00396898, 0.00658913, 0.00566906, 0.0031171 ]),
 'rank_test_accuracy': array([5, 4, 3, 2, 1], dtype=int32),
 'split0_test_roc_auc_ovr': array([0.70416362, 0.78092969, 0.81000112, 0.83122647, 0.84536703]),
 'split1_test_roc_auc_ovr': array([0.70787723, 0.7742744 , 0.80519635, 0.8183241 , 0.84252886]),
 'split2_test_roc_auc_ovr': array([0.72969056, 0.79579609, 0.83306415, 0.84342244, 0.85472723]),
 'split3_test_roc_auc_ovr': array([0.7143731 , 0.8015165 , 0.82774028, 0.84099496, 0.8544812 ]),
 'split4_test_roc_auc_ovr': array([0.70906419, 0.77907393, 0.81852543, 0.82607812, 0.83678429]),
 'mean_test_roc_auc_ovr': array([0.71303374, 0.78631812, 0.81890547, 0.83200922, 0.84677772]),
 'std_test_roc_auc_ovr': array([0.00894728, 0.01046315, 0.01044886, 0.00931749, 0.00696347]),
 'rank_test_roc_auc_ovr': array([5, 4, 3, 2, 1], dtype=int32),
 'split0_test_f1_micro': array([0.84946782, 0.87176888, 0.87582362, 0.8768373 , 0.87633046]),
 'split1_test_f1_micro': array([0.85352255, 0.86416624, 0.86568677, 0.86771414, 0.87328941]),
 'split2_test_f1_micro': array([0.85605677, 0.86872783, 0.8803852 , 0.88139888, 0.88089204]),
 'split3_test_f1_micro': array([0.85656361, 0.87633046, 0.88545362, 0.88443994, 0.88190573]),
 'split4_test_f1_micro': array([0.84888438, 0.87068966, 0.87931034, 0.87880325, 0.87829615]),
 'mean_test_f1_micro': array([0.85289903, 0.87033661, 0.87733191, 0.8778387 , 0.87814276]),
 'std_test_f1_micro': array([0.00321496, 0.00396898, 0.00658913, 0.00566906, 0.0031171 ]),
 'rank_test_f1_micro': array([5, 4, 3, 2, 1], dtype=int32)}
In [ ]:
# get the model with highest accuracy from grid search
p_accu = best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ]
p_accu
Out[ ]:
{'classifier__n_neighbors': 10}
In [ ]:
# set the selected parameters on the pipeline
pipe.set_params(**p_accu)
Out[ ]:
Pipeline(steps=[('scaler', StandardScaler()), ('selector', VarianceThreshold()),
                ('classifier', KNeighborsClassifier(n_neighbors=10))])
In [ ]:
# train on the entire training set with the model with highest accuracy from grid search
clf = pipe.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score2 = clf.score(X_test, y_test)
print("The accuracy score for the optimized model is", score2)
The accuracy score for the optimized model is 0.8896999188969992
In [ ]:
# print out stats and visualization of confusion matrix after optimization
cm5 = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(cm5, cmap=plt.cm.Blues, alpha=0.3)
for i in range(cm5.shape[0]):
    for j in range(cm5.shape[1]):
        ax.text(x=j, y=i,s=cm5[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
[Figure: confusion matrix]
In [ ]:
proba = clf.predict_proba(X_test)[:, 1]  # predicted probability of a purchase
fpr, tpr, _ = roc_curve(y_test, proba)   # false and true positive rates
roc_auc2 = roc_auc_score(y_test, proba)
# plotting the ROC curve after optimization
plt.figure(dpi=100)
plt.plot(fpr, tpr)
plt.title('ROC curve') 
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate')
print('Area under the Receiver Operating Characteristic curve:', roc_auc2)
Area under the Receiver Operating Characteristic curve: 0.865143157341739
[Figure: ROC curve]

Support Vector Machine(SVM)¶

In [ ]:
# use a linear kernel and C = 1 for our SVM model, and set random_state to 0
from sklearn import metrics
from sklearn.svm import SVC
svm = SVC(kernel = 'linear', C = 1, random_state = 0, probability = True)
svm.fit(X_train,y_train)
y_pred = svm.predict(X_test)
In [ ]:
accuracy = metrics.accuracy_score(y_test, y_pred)

print("The accuracy score for this model is:", accuracy)
The accuracy score for this model is: 0.8925385239253852
In [ ]:
# print out stats and visualization of confusion matrix
print(classification_report(y_test, y_pred))

cm6 = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(cm6, cmap=plt.cm.Blues, alpha=0.3)
for i in range(cm6.shape[0]):
    for j in range(cm6.shape[1]):
        ax.text(x=j, y=i,s=cm6[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      2115
           1       0.74      0.38      0.50       351

    accuracy                           0.89      2466
   macro avg       0.82      0.68      0.72      2466
weighted avg       0.88      0.89      0.88      2466

[Figure: confusion matrix]
In [ ]:
proba = svm.predict_proba(X_test)[:, 1]  # predicted probability of a purchase
fpr, tpr, _ = roc_curve(y_test, proba)   # false and true positive rates
roc_auc = roc_auc_score(y_test, proba)

# plotting the ROC curve
plt.figure(dpi=100)
plt.plot(fpr, tpr)
plt.title('ROC curve') 
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate')
print('Area under the Receiver Operating Characteristic curve:', roc_auc)
Area under the Receiver Operating Characteristic curve: 0.675860257420541
[Figure: ROC curve]
In [ ]:
# finding the Permutation importance

perm = PermutationImportance(svm).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())
Out[ ]:
Weight Feature
0.1005 ± 0.0082 PageValues
0.0019 ± 0.0084 ProductRelated_Duration
0.0015 ± 0.0016 Month_Nov
0.0003 ± 0.0008 ProductRelated
0.0002 ± 0.0004 ExitRates
0.0002 ± 0.0004 BounceRates
0.0001 ± 0.0003 Month_Aug
0.0001 ± 0.0012 Month_May
0 ± 0.0000 Administrative_Duration
0 ± 0.0000 VisitorType_Other
0 ± 0.0000 Month_Mar
-0.0001 ± 0.0006 Informational
-0.0002 ± 0.0012 VisitorType_Returning_Visitor
-0.0002 ± 0.0006 Month_Oct
-0.0002 ± 0.0006 Administrative
-0.0003 ± 0.0006 VisitorType_New_Visitor
-0.0003 ± 0.0003 Weekend
-0.0004 ± 0.0000 Month_Feb
-0.0005 ± 0.0006 Month_Jul
-0.0005 ± 0.0003 Month_June
… 4 more …

From this application of permutation importance to our SVM model, 'PageValues' and 'ProductRelated_Duration' again appear to be the main features of importance in determining who will or will not make a purchase. This indicates that the more often and the longer product pages are viewed, the more likely the customer is to make a purchase.

However, we might obtain a better model by selecting better hyperparameters, so we will run a grid search over different hyperparameter values, select the best model from the grid, train it on the entire training set, and evaluate it on the test set.

In [ ]:
# create a pipeline 
# scale the features before fitting the classifier
pipe = Pipeline([('scaler', StandardScaler()),("classifier", SVC())])

# FIT THE MODEL
pipe.fit(X_train, y_train)
print('Training set score: ' + str(pipe.score(X_train,y_train)))
print('Test set score: ' + str(pipe.score(X_test,y_test)))
Training set score: 0.9011557177615572
Test set score: 0.8994322789943228
In [ ]:
parameters = {
 'classifier__C': [0.001, 0.01,0.1, 1, 3,5],
 'classifier__kernel':['linear', 'rbf'],
 'classifier__probability': [True],
}
# create grid search
grid = GridSearchCV(pipe, parameters,scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'],
                    cv=5, n_jobs = -1, refit=False,verbose=0).fit(X_train, y_train)
In [ ]:
best_model = grid.fit(X_train, y_train)
In [ ]:
best_model.cv_results_
Out[ ]:
{'mean_fit_time': array([ 8.83730559, 10.71986699,  7.94916902, 11.71108265,  9.90437055,
        10.5455781 , 26.57819452,  9.34472299, 56.89408188,  9.67510219,
        81.97286243,  8.83459206]),
 'std_fit_time': array([ 0.12247692,  0.09365903,  0.2559104 ,  0.41042347,  0.50441932,
         0.33550414,  2.96277921,  0.49890039,  5.62349283,  0.65296627,
        11.45592228,  0.93956423]),
 'mean_score_time': array([0.25707002, 0.49590178, 0.26461873, 0.53437381, 0.23703604,
        0.46670737, 0.22173724, 0.43391843, 0.27389264, 0.40725298,
        0.19027405, 0.37588048]),
 'std_score_time': array([0.01880509, 0.02473279, 0.02787563, 0.01641703, 0.02177347,
        0.01198304, 0.00966425, 0.02484511, 0.09350725, 0.01028193,
        0.03154679, 0.06207264]),
 'param_classifier__C': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 1, 1, 3, 3, 5, 5],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf',
                    'linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__probability': masked_array(data=[True, True, True, True, True, True, True, True, True,
                    True, True, True],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier__C': 0.001,
   'classifier__kernel': 'linear',
   'classifier__probability': True},
  {'classifier__C': 0.001,
   'classifier__kernel': 'rbf',
   'classifier__probability': True},
  {'classifier__C': 0.01,
   'classifier__kernel': 'linear',
   'classifier__probability': True},
  {'classifier__C': 0.01,
   'classifier__kernel': 'rbf',
   'classifier__probability': True},
  {'classifier__C': 0.1,
   'classifier__kernel': 'linear',
   'classifier__probability': True},
  {'classifier__C': 0.1,
   'classifier__kernel': 'rbf',
   'classifier__probability': True},
  {'classifier__C': 1,
   'classifier__kernel': 'linear',
   'classifier__probability': True},
  {'classifier__C': 1,
   'classifier__kernel': 'rbf',
   'classifier__probability': True},
  {'classifier__C': 3,
   'classifier__kernel': 'linear',
   'classifier__probability': True},
  {'classifier__C': 3,
   'classifier__kernel': 'rbf',
   'classifier__probability': True},
  {'classifier__C': 5,
   'classifier__kernel': 'linear',
   'classifier__probability': True},
  {'classifier__C': 5,
   'classifier__kernel': 'rbf',
   'classifier__probability': True}],
 'split0_test_accuracy': array([0.87582362, 0.84237202, 0.88748099, 0.84237202, 0.88697415,
        0.88748099, 0.88697415, 0.89406994, 0.88697415, 0.89812468,
        0.88697415, 0.89660416]),
 'split1_test_accuracy': array([0.86670046, 0.84237202, 0.87531678, 0.84237202, 0.87835783,
        0.87176888, 0.87886467, 0.89001521, 0.87886467, 0.88900152,
        0.87886467, 0.89254942]),
 'split2_test_accuracy': array([0.87582362, 0.84186518, 0.88697415, 0.84186518, 0.88900152,
        0.87886467, 0.88900152, 0.8935631 , 0.88900152, 0.90217942,
        0.88900152, 0.90369995]),
 'split3_test_accuracy': array([0.87835783, 0.84186518, 0.88596047, 0.84186518, 0.88596047,
        0.88291941, 0.88545362, 0.89508363, 0.88545362, 0.89609731,
        0.88545362, 0.89305626]),
 'split4_test_accuracy': array([0.87576065, 0.84229209, 0.88133874, 0.84229209, 0.88286004,
        0.87576065, 0.88235294, 0.88691684, 0.88235294, 0.88945233,
        0.88235294, 0.89401623]),
 'mean_test_accuracy': array([0.87449323, 0.8421533 , 0.88341423, 0.8421533 , 0.8846308 ,
        0.87935892, 0.88452938, 0.89192974, 0.88452938, 0.89497105,
        0.88452938, 0.8959852 ]),
 'std_test_accuracy': array([0.00402017, 0.00023705, 0.00459503, 0.00023705, 0.00371075,
        0.0054671 , 0.00356811, 0.00303282, 0.00356811, 0.0050846 ,
        0.00356811, 0.0041027 ]),
 'rank_test_accuracy': array([10, 11,  8, 11,  4,  9,  5,  3,  5,  2,  5,  1], dtype=int32),
 'split0_test_roc_auc_ovr': array([0.86017118, 0.85514779, 0.87618451, 0.8790575 , 0.87417728,
        0.88090125, 0.87342856, 0.87514945, 0.87322445, 0.8699916 ,
        0.87348466, 0.86704122]),
 'split1_test_roc_auc_ovr': array([0.88095832, 0.88080355, 0.88839135, 0.8886951 , 0.88803731,
        0.89160292, 0.88865254, 0.88748206, 0.8885945 , 0.88166642,
        0.88869897, 0.88050174]),
 'split2_test_roc_auc_ovr': array([0.86786613, 0.88119414, 0.88368723, 0.89329297, 0.88328779,
        0.89331516, 0.88384449, 0.89005696, 0.88381844, 0.88423139,
        0.88399887, 0.88537566]),
 'split3_test_roc_auc_ovr': array([0.87050491, 0.87895865, 0.88083619, 0.88437418, 0.87930502,
        0.8833727 , 0.87853124, 0.8654618 , 0.87856597, 0.86070717,
        0.87889787, 0.85629506]),
 'split4_test_roc_auc_ovr': array([0.87069154, 0.87121422, 0.88257471, 0.88821866, 0.88105507,
        0.8832358 , 0.88105217, 0.86695246, 0.88091472, 0.85729551,
        0.88089537, 0.86070259]),
 'mean_test_roc_auc_ovr': array([0.87003842, 0.87346367, 0.8823348 , 0.88672768, 0.8811725 ,
        0.88648556, 0.8811018 , 0.87702055, 0.88102362, 0.87077842,
        0.88119515, 0.86998325]),
 'std_test_roc_auc_ovr': array([0.00666288, 0.00984236, 0.00396586, 0.00476586, 0.00456146,
        0.0049853 , 0.00509951, 0.01017706, 0.00513825, 0.01080156,
        0.00507912, 0.01122246]),
 'rank_test_roc_auc_ovr': array([11,  9,  3,  1,  5,  2,  6,  8,  7, 10,  4, 12], dtype=int32),
 'split0_test_f1_micro': array([0.87582362, 0.84237202, 0.88748099, 0.84237202, 0.88697415,
        0.88748099, 0.88697415, 0.89406994, 0.88697415, 0.89812468,
        0.88697415, 0.89660416]),
 'split1_test_f1_micro': array([0.86670046, 0.84237202, 0.87531678, 0.84237202, 0.87835783,
        0.87176888, 0.87886467, 0.89001521, 0.87886467, 0.88900152,
        0.87886467, 0.89254942]),
 'split2_test_f1_micro': array([0.87582362, 0.84186518, 0.88697415, 0.84186518, 0.88900152,
        0.87886467, 0.88900152, 0.8935631 , 0.88900152, 0.90217942,
        0.88900152, 0.90369995]),
 'split3_test_f1_micro': array([0.87835783, 0.84186518, 0.88596047, 0.84186518, 0.88596047,
        0.88291941, 0.88545362, 0.89508363, 0.88545362, 0.89609731,
        0.88545362, 0.89305626]),
 'split4_test_f1_micro': array([0.87576065, 0.84229209, 0.88133874, 0.84229209, 0.88286004,
        0.87576065, 0.88235294, 0.88691684, 0.88235294, 0.88945233,
        0.88235294, 0.89401623]),
 'mean_test_f1_micro': array([0.87449323, 0.8421533 , 0.88341423, 0.8421533 , 0.8846308 ,
        0.87935892, 0.88452938, 0.89192974, 0.88452938, 0.89497105,
        0.88452938, 0.8959852 ]),
 'std_test_f1_micro': array([0.00402017, 0.00023705, 0.00459503, 0.00023705, 0.00371075,
        0.0054671 , 0.00356811, 0.00303282, 0.00356811, 0.0050846 ,
        0.00356811, 0.0041027 ]),
 'rank_test_f1_micro': array([10, 11,  8, 11,  4,  9,  5,  3,  5,  2,  5,  1], dtype=int32)}
In [ ]:
# get the model with highest accuracy from grid search
p_accu = best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ]
p_accu
Out[ ]:
{'classifier__C': 5,
 'classifier__kernel': 'rbf',
 'classifier__probability': True}
In [ ]:
# set the selected parameters on the pipeline
pipe.set_params(**p_accu)
Out[ ]:
Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier', SVC(C=5, probability=True))])
In [ ]:
# train on the entire training set with the model with highest accuracy from grid search
clf = pipe.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score3 = clf.score(X_test, y_test)
print("The accuracy score for the optimized model is", score3)
The accuracy score for the optimized model is 0.9038929440389294
In [ ]:
# print out stats and visualization of confusion matrix after optimization
cm7 = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(cm7, cmap=plt.cm.Blues, alpha=0.3)
for i in range(cm7.shape[0]):
    for j in range(cm7.shape[1]):
        ax.text(x=j, y=i,s=cm7[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
[Figure: confusion matrix]
In [ ]:
proba = clf.predict_proba(X_test)[:, 1]  # predicted probability of a purchase
fpr, tpr, _ = roc_curve(y_test, proba)   # false and true positive rates
roc_auc3 = roc_auc_score(y_test, proba)
# plotting the ROC curve after optimization
plt.figure(dpi=100)
plt.plot(fpr, tpr)
plt.title('ROC curve') 
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate')
print('Area under the Receiver Operating Characteristic curve:', roc_auc3)
Area under the Receiver Operating Characteristic curve: 0.8649464885871506
[Figure: ROC curve]

Random Forest¶

We train a random forest model on our dataset, which builds an ensemble of decision trees and predicts by majority vote among them. We then print out its accuracy, confusion matrix, and ROC curve to evaluate its performance.

In [ ]:
from sklearn.ensemble import RandomForestClassifier
In [ ]:
# setup model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
In [ ]:
# evaluating the model
accuracy = metrics.accuracy_score(y_test, y_pred)
print("The accuracy score for this model is:", accuracy)
The accuracy score for this model is: 0.9087591240875912
In [ ]:
# confusion matrix
cm8 = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(cm8, cmap=plt.cm.Blues, alpha=0.3)
for i in range(cm8.shape[0]):
    for j in range(cm8.shape[1]):
        ax.text(x=j, y=i,s=cm8[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
[Figure: confusion matrix]
In [ ]:
# classification report
cr = classification_report(y_test, y_pred)
print(cr)
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2115
           1       0.74      0.58      0.65       351

    accuracy                           0.91      2466
   macro avg       0.84      0.77      0.80      2466
weighted avg       0.91      0.91      0.91      2466

In [ ]:
proba = rf.predict_proba(X_test)[:, 1]  # predicted probability of a purchase
fpr, tpr, _ = roc_curve(y_test, proba)  # false and true positive rates
roc_auc = roc_auc_score(y_test, proba)

# plotting the ROC curve
plt.figure(dpi=100)
plt.plot(fpr, tpr)
plt.title('ROC curve') 
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate')
print('Area under the Receiver Operating Characteristic curve:', roc_auc)
Area under the Receiver Operating Characteristic curve: 0.9300297023701278
[Figure: ROC curve]
In [ ]:
# finding the Permutation importance

import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(rf).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())
Out[ ]:
Weight Feature
0.1241 ± 0.0060 PageValues
0.0102 ± 0.0070 ExitRates
0.0101 ± 0.0041 ProductRelated
0.0077 ± 0.0034 Month_Nov
0.0076 ± 0.0059 BounceRates
0.0061 ± 0.0057 ProductRelated_Duration
0.0060 ± 0.0033 Administrative
0.0024 ± 0.0006 Informational_Duration
0.0022 ± 0.0030 Administrative_Duration
0.0018 ± 0.0014 Month_Mar
0.0014 ± 0.0011 Month_May
0.0013 ± 0.0027 Informational
0.0009 ± 0.0013 Month_Dec
0.0008 ± 0.0014 Weekend
0.0006 ± 0.0031 VisitorType_Returning_Visitor
0.0006 ± 0.0008 SpecialDay
0.0005 ± 0.0003 VisitorType_New_Visitor
0.0005 ± 0.0006 Month_Jul
0.0004 ± 0.0000 Month_June
0.0002 ± 0.0004 Month_Sep
… 4 more …

From this application of permutation importance to our random forest model, 'PageValues' appears to be the main feature of importance in determining who will or will not make a purchase. The more a product page has been viewed, the more likely the customer is to make a purchase.

However, we might obtain a better model by selecting better hyperparameters, so we will run a grid search over different hyperparameter values, select the best model from the grid, train it on the entire training set, and evaluate it on the test set.

In [ ]:
# create a pipeline 
# scale the features before fitting the classifier
pipe = Pipeline([('scaler', StandardScaler()),("classifier", RandomForestClassifier())])

# FIT THE MODEL
pipe.fit(X_train, y_train)
print('Training set score: ' + str(pipe.score(X_train,y_train)))
print('Test set score: ' + str(pipe.score(X_test,y_test)))
Training set score: 0.9997972424979724
Test set score: 0.9120032441200324
In [ ]:
param_grid = { 
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth' : [10, 1000, 10000, 100000],
    'classifier__criterion' :['gini', 'entropy']
}

# create grid search
grid = GridSearchCV(pipe, param_grid, scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'],
                    cv=5, refit=False, verbose=0).fit(X_train, y_train)
In [ ]:
best_model = grid.fit(X_train, y_train)
In [ ]:
# get the model with highest accuracy from grid search
p_accu = best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ]
p_accu
Out[ ]:
{'classifier__criterion': 'gini',
 'classifier__max_depth': 10,
 'classifier__max_features': 'sqrt',
 'classifier__n_estimators': 200}
In [ ]:
# set the selected parameters on the pipeline
pipe.set_params(**p_accu)
Out[ ]:
Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 RandomForestClassifier(max_depth=10, max_features='sqrt',
                                        n_estimators=200))])
In [ ]:
# train on the entire training set with the model with highest accuracy from grid search
clf = pipe.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score4 = clf.score(X_test, y_test)
print("The accuracy score for the optimized model is", score4)
The accuracy score for the optimized model is 0.9103811841038119
In [ ]:
# print out stats and visualization of confusion matrix after optimization
cm9 = metrics.confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7.5, 7.5))
ax.matshow(cm9, cmap=plt.cm.Blues, alpha=0.3)
for i in range(cm9.shape[0]):
    for j in range(cm9.shape[1]):
        ax.text(x=j, y=i,s=cm9[i, j], va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
[Figure: confusion matrix]
In [ ]:
proba = clf.predict_proba(X_test)[:, 1]  # predicted probability of a purchase
fpr, tpr, _ = roc_curve(y_test, proba)   # false and true positive rates
roc_auc4 = roc_auc_score(y_test, proba)
# plotting the ROC curve after optimization
plt.figure(dpi=100)
plt.plot(fpr, tpr)
plt.title('ROC curve') 
plt.xlabel('False Positive Rate') 
plt.ylabel('True Positive Rate')
print('Area under the Receiver Operating Characteristic curve:', roc_auc4)
Area under the Receiver Operating Characteristic curve: 0.9372276440834361
[Figure: ROC curve]

Results¶

Dataset and Algorithms Analysis¶

Our dataset includes various features of consumer shopping behavior, including time spent on a page, number of pages browsed, special-day influence, etc., together with the purchasing decision in binary form. Since we would like to predict a future customer's purchasing decision (purchase or not) from their shopping behavior, we employ classification machine learning models to solve this problem. Because this is a classification problem, regression models such as linear regression and polynomial regression are not appropriate. The models we implement are logistic regression, KNN, SVM, and random forest.

Feature Selection¶

After carefully examining all the columns in our dataset during our exploratory data analysis, we found that columns like OperatingSystems, Browser, Region, and TrafficType do not reveal the meaning of their values, so we decided to drop these columns from our prediction models during feature selection. Before fitting our dataset to the models, we transformed the different column types to get better results. We transformed the categorical columns, VisitorType and Month, with one-hot encoding; although month may look like a numeric variable, given its role in our dataset we decided to treat it as categorical. We transformed the numeric columns with a standard scaler so that features with large ranges do not dominate the others.
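
As a sketch, the preprocessing described above could also be expressed with a single ColumnTransformer; the column names follow the UCI dataset, and the actual transformations in our notebook were applied step by step earlier:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical = ['Month', 'VisitorType']
numeric = ['Administrative', 'Administrative_Duration', 'Informational',
           'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration',
           'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay']

preprocess = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('scale', StandardScaler(), numeric),
], remainder='passthrough')  # remaining columns (e.g. Weekend) pass through unchanged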

Hyper-Parameters Selection¶

For each model, we use a Pipeline and GridSearchCV to search for the hyperparameters that yield the best performance on our dataset, exploring different solvers, penalties, and C values where applicable. The metric we use for hyperparameter selection is accuracy. To make the best use of our dataset and obtain an unbiased accuracy estimate while comparing hyperparameters, we compare the average validation-set accuracy from k-fold cross-validation.
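
The underlying comparison can be illustrated with cross_val_score, here using KNN as the example model: for each candidate hyperparameter value, average the validation accuracy over the folds and keep the best. This is a simplified view of what GridSearchCV automates:

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

best_score, best_k = -1.0, None
for k in [1, 3, 5, 7, 10]:
    candidate = Pipeline([('scaler', StandardScaler()),
                          ('classifier', KNeighborsClassifier(n_neighbors=k))])
    # mean validation accuracy over 5 folds
    score = cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy').mean()
    if score > best_score:
        best_score, best_k = score, k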

Model Selection:¶

Logistic Regression:¶

Logistic regression should be a suitable classifier for our large dataset given its advantage in speed, and it fulfills our requirement of predicting a binary purchase decision from multiple features. Before selecting the best hyperparameters, we obtained an accuracy of 88.93% and an ROC AUC of 0.908. After selecting the best-performing hyperparameters, we obtained an accuracy of 89.09% and an ROC AUC of 0.9099.
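
One consequence of the selected L1 penalty with a small C is that many coefficients are driven exactly to zero. As a sketch, this can be checked on the fitted pipeline from the logistic regression section (clf here refers to that fitted pipeline):

import numpy as np

logreg = clf.named_steps['classifier']  # the fitted LogisticRegression inside the pipeline
n_nonzero = int(np.sum(logreg.coef_ != 0))
print(n_nonzero, 'of', logreg.coef_.size, 'coefficients are nonzero')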

K-Nearest Neighbor:¶

KNN predicts the value of a new data point by considering its k nearest neighbours across multiple features, which makes it feasible for our dataset. Though it has limitations on high-dimensional problems because of the difficulty of measuring distance in many dimensions, we implement it because it requires almost no training time. Without tuning, we obtained an accuracy of 87.43% and an ROC AUC of 0.7985. After selecting the best-performing hyperparameter, we obtained an accuracy of 88.97% and an ROC AUC of 0.8651.

Support Vector Machine:¶

SVM draws a decision boundary in the form of a maximum-margin separator between the two classes in our dataset: purchase (true or false). It suits our dataset given how well it accommodates high-dimensional, large datasets, as well as its advantages in accuracy. Before tuning, we obtained an accuracy of 89.25% and an ROC AUC of 0.6758. After selecting the model with the highest accuracy, we obtained an accuracy of 90.39% and an ROC AUC of 0.8649.
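
To make the margin view concrete: for a binary SVM, decision_function returns each sample's signed distance to the separating hyperplane, and the predicted class is simply the sign of that distance. A minimal sketch, assuming the svm model fitted above:

import numpy as np

margins = svm.decision_function(X_test)  # signed distance to the decision boundary
preds = (margins > 0).astype(int)        # positive side of the margin => predicted purchase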

Random Forest:¶

Random forest is a classification algorithm consisting of many decision trees. It uses bagging and feature randomness when building each individual tree to create an uncorrelated forest of trees whose committee prediction is more accurate than that of any individual tree. It suits our dataset given how well it accommodates high-dimensional, large datasets, as well as its advantages in accuracy. Before tuning, we obtained an accuracy of 90.88% and an ROC AUC of 0.93. After selecting the model with the highest accuracy, we obtained an accuracy of 91.04% and an ROC AUC of 0.9372.
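
The bagging-plus-feature-randomness idea can be sketched by hand: each tree sees a bootstrap sample of the rows and considers a random subset of features at each split, and the forest predicts by majority vote. This is a simplified illustration of what RandomForestClassifier does internally, assuming X_train and y_train are pandas objects:

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
trees = []
for _ in range(50):
    # bagging: draw a bootstrap sample of the training rows (with replacement)
    idx = rng.integers(0, len(X_train), size=len(X_train))
    # feature randomness: each split considers only sqrt(n_features) candidates
    tree = DecisionTreeClassifier(max_features='sqrt')
    trees.append(tree.fit(X_train.iloc[idx], y_train.iloc[idx]))

# the forest's prediction is a majority vote over the individual trees
votes = np.mean([t.predict(X_test) for t in trees], axis=0)
forest_pred = (votes > 0.5).astype(int)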

In [ ]:
table = pd.DataFrame({'Algorithm': ['Logistic Regression', 'KNN','SVM',
                                    'Random Forest'], 
                      'Accuracy': [score1, score2, score3, score4], 
                      'ROC Score': [roc_auc1, roc_auc2, roc_auc3, roc_auc4]})

print('============ Model Summary ===========')
table
============ Model Summary ===========
Out[ ]:
Algorithm Accuracy ROC Score
0 Logistic Regression 0.890916 0.909852
1 KNN 0.889700 0.865143
2 SVM 0.903893 0.864946
3 Random Forest 0.910381 0.937228

Looking at the test-set accuracy of the four models with the most appropriate hyperparameters for our dataset, we can see that random forest has the highest accuracy among all models. KNN has the least desirable performance of the four, which could be due to the high dimensionality of our dataset and KNN's poor performance on high-dimensional data. Logistic regression and SVM are also appropriate classifiers for our dataset, since they also reach nearly 90% accuracy on the test set.

Discussion¶

Interpreting the result¶

Employing logistic regression, KNN, SVM, and random forest to predict whether a customer will make a purchase, random forest generally has the best performance. Among all the features, page value appears to be one of the most important.

Why random forest?¶

Random forest, as one of the strongest tree-based models, gave the best results in our project, with an accuracy of 0.91. With multiple decision trees operating at the same time, it performs the classification more comprehensively.

In contrast, logistic regression calculates the probability for each data point, but the model does not distinguish each value according to its unique characteristics. KNN considers only the labels of the nearest k neighbours to make a prediction. SVM draws its decision boundary based on only the few data points that separate the classes. Thus, the random forest of decision trees achieves the better performance.

Page value¶

Page value turns out to be one of the most important features for predicting the outcome according to all four models, which indicates strong predictive power. Page value contains information about how many times the webpage of a product was viewed: the more views it gets, the more likely the customer is to make a purchase.
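
One rough way to eyeball this relationship, assuming the raw data is still available as a DataFrame (df is a hypothetical name here) with the PageValues feature and the Revenue label, is to bin PageValues and compare the purchase rate per bin:

import pandas as pd

# purchase rate within coarse PageValues bins (bin edges are illustrative)
bins = pd.cut(df['PageValues'], [-0.01, 0.0, 5.0, 20.0, df['PageValues'].max()])
print(df.groupby(bins)['Revenue'].mean())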

This demonstrates the significance of marketing and publicity. The mere exposure effect states that people tend to hold more positive opinions toward anything they are familiar with, in other words, anything that has had more exposure to them. It is therefore important for merchants to put effort into marketing and advertising to increase revenue.

Limitations¶

The limitations of our project can be seen in three parts.

Firstly, the models are highly time-consuming, especially the SVM model which takes up to a few hours to complete. Due to the exhaustive nature of the grid search and the massive amount of data, it appears challenging to diminish the amount of time for selecting hyperparameters.

Secondly, our goal was to simply predict if the customer makes a purchase instead of measuring the revenue generated by each customer, which ignores the price range and the profit margin of each product and would limit the implication of our results. Certain customer behavior might suggest a tendency to purchase more expensive products which generate more revenue, but it would be impossible to tell from the current dataset and model.

Lastly, some of the columns have unidentified meanings. For instance, what each value in Browser stands for is currently unknown. We omitted those columns when fitting the models, so they are unlikely to have contributed to the analysis; nevertheless, this constitutes one of the limitations of our project.

Ethics & Privacy¶

This dataset is a public dataset obtained from the UCI Machine Learning Repository. The personal identification information has been erased from the dataset before we obtained the data to ensure anonymity, so there should be no privacy issues for our project.

However, there is no information about when those data were collected, and whether they are representative of a typical online shopping customer. Therefore, there might be some biases in our dataset which will result in some undesirable variation in our model.

Another ethical concern is that scammers could use this model to take unfair advantage by targeting their potential victims more precisely, selecting those who are more prone to make a purchase according to the algorithm we designed. Thus, we should declare this project to be for educational purposes only and state these concerns explicitly if we decide to make our model public on GitHub.


Conclusion¶

By utilizing four machine learning models, logistic regression, K-nearest neighbours, support vector machine, and random forest, the prediction of revenue from 14 customer-behavior features is fairly accurate. With a dataset of 12,330 entries, we obtained the highest accuracy of 0.910381 and an ROC AUC score of 0.937228 on the test set with the random forest model, after feature selection and a grid search over hyperparameters. Page value is the most important indicator of customer behavior, and it demonstrates the significance of marketing for products.

Footnotes¶

1.^: R. A. E.-D. Ahmeda, M. E. Shehaba, S. Morsya and N. Mekawiea, "Performance Study of Classification Algorithms for Consumer Online Shopping Attitudes and Behavior Using Data Mining," 2015 Fifth International Conference on Communication Systems and Network Technologies, 2015, pp. 1344-1349, doi: 10.1109/CSNT.2015.50. https://ieeexplore.ieee.org/abstract/document/7280138?casa_token=XkLf7qMj_xgAAAAA:nXykZzXHLKrjWwqPlZ8rUtD--pGMmEtgtAcTVaeUcePQ0HST6ZgwEtDnoX4YKhbpolt569EhHx0K

2.^: Sakar, C.O., Polat, S.O., Katircioglu, M. et al. Real-time prediction of online shoppers’ purchasing intention using multilayer perceptron and LSTM recurrent neural networks. Neural Comput & Applic 31, 6893–6908 (2019). https://doi.org/10.1007/s00521-018-3523-0

3.^: C.J. Carmona, S. Ramírez-Gallego, F. Torres, E. Bernal, M.J. del Jesus, S. García, Web usage mining to improve the design of an e-commerce website: OrOliveSur.com, Expert Systems with Applications, Volume 39, Issue 12, 2012, Pages 11243-11249, ISSN 0957-4174, https://doi.org/10.1016/j.eswa.2012.03.046 (https://www.sciencedirect.com/science/article/pii/S0957417412005696)
