Importing necessary libraries¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import gc

warnings.filterwarnings("ignore")
plt.style.use("seaborn-v0_8")

Loading the cleaned dataset from our previous work¶

In [2]:
df = pd.read_csv('data/titanic_cleaned.csv')
df.head()
Out[2]:
passengerid survived pclass name sex age sibsp parch ticket fare embarked title
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S Mr
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C Mrs
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S Miss
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S Mrs
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S Mr

Feature Selection¶

In [3]:
print("*" * 100)
print("Features Information")
display(df.info())
print("*" * 100)
print("Unique Values in Each Column")
display(df.nunique())
print("*" * 100)
print("Descriptive Statistics")
display(df.describe())
print("*" * 100)
****************************************************************************************************
Features Information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          891 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  embarked     891 non-null    object 
 11  title        891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
****************************************************************************************************
Unique Values in Each Column
passengerid    891
survived         2
pclass           3
name           891
sex              2
age             88
sibsp            7
parch            7
ticket         681
fare           248
embarked         3
title            5
dtype: int64
****************************************************************************************************
Descriptive Statistics
passengerid survived pclass age sibsp parch fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.659001 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 13.289967 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 21.000000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 30.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 35.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
****************************************************************************************************

Dependent vs. Independent Variables¶

Testing Feature Relevance with the Chi-Square Test¶

In [4]:
from scipy.stats import chi2_contingency

# Perform Chi-Squared Test for Categorical Features

for feat in ["pclass", "sex", "embarked", "title", "ticket"]:
    df[feat] = df[feat].astype("category")
    contingency_table = pd.crosstab(df[feat], df["survived"])
    display(pd.DataFrame(contingency_table))
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi2: {chi2}, p-value: {p}")
    if feat != "ticket":
        sns.countplot(data=df, x=feat, hue="survived")
        plt.title(f"Survival Count by {feat}")
        plt.show()
survived 0 1
pclass
1 80 136
2 97 87
3 372 119
Chi2: 102.88898875696056, p-value: 4.549251711298793e-23
[Image: Survival Count by pclass]
survived 0 1
sex
female 81 233
male 468 109
Chi2: 260.71702016732104, p-value: 1.1973570627755645e-58
[Image: Survival Count by sex]
survived 0 1
embarked
C 75 93
Q 47 30
S 427 219
Chi2: 25.964452881874784, p-value: 2.3008626481449577e-06
[Image: Survival Count by embarked]
survived 0 1
title
Master 17 23
Miss 55 129
Mr 437 82
Mrs 26 103
Officer 14 5
Chi2: 291.3420130772467, p-value: 7.984173381155626e-62
[Image: Survival Count by title]
survived 0 1
ticket
110152 0 3
110413 1 2
110465 2 0
110564 0 1
110813 0 1
... ... ...
W./C. 6608 4 0
W./C. 6609 1 0
W.E.P. 5734 1 0
W/C 14208 1 0
WE/P 5735 1 1

681 rows × 2 columns

Chi2: 766.5697029458893, p-value: 0.01152729601163775
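
As a sanity check (added here, not part of the original run), the statistic can be reproduced by hand: the chi-square test compares the observed counts with the counts expected under independence, chi2 = sum((observed - expected)^2 / expected). A minimal sketch using the pclass crosstab printed above (np is already imported at the top of the notebook):

In [ ]:
# Sanity-check sketch: reproduce the pclass chi-square statistic by hand.
# Observed counts are copied from the pclass crosstab printed above.
observed = np.array([[80, 136], [97, 87], [372, 119]])
expected = np.outer(observed.sum(axis=1), observed.sum(axis=0)) / observed.sum()
chi2_manual = ((observed - expected) ** 2 / expected).sum()
print(chi2_manual)  # ~102.89, matching chi2_contingency above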

Split the data into features (independent variables) and target (dependent variable)¶

In [5]:
X = df[["sex", "age", "fare", "pclass", "sibsp", "parch", "embarked", "title"]]
y = df["survived"]

print("X: ", X.shape)
print("y: ", y.shape)
X:  (891, 8)
y:  (891,)
In [6]:
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   sex       891 non-null    category
 1   age       891 non-null    float64 
 2   fare      891 non-null    float64 
 3   pclass    891 non-null    category
 4   sibsp     891 non-null    int64   
 5   parch     891 non-null    int64   
 6   embarked  891 non-null    category
 7   title     891 non-null    category
dtypes: category(4), float64(2), int64(2)
memory usage: 32.0 KB

We need to convert categorical variables into numerical format¶

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
In [8]:
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
print(numerical_cols)  # ['age', 'fare', 'sibsp', 'parch']
categorical_cols = X.select_dtypes("category").columns
print(categorical_cols)  # ['sex', 'pclass', 'embarked', 'title']
Index(['age', 'fare', 'sibsp', 'parch'], dtype='object')
Index(['sex', 'pclass', 'embarked', 'title'], dtype='object')
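
Before wiring everything into a pipeline, it can help to see what the encoder produces on a single column. A minimal illustration sketch (assuming scikit-learn ≥ 1.2 for the sparse_output argument): with drop="first", the binary sex column collapses to a single sex_male indicator.

In [ ]:
# Illustration sketch: one-hot encoding the 'sex' column with drop="first".
from sklearn.preprocessing import OneHotEncoder

demo_enc = OneHotEncoder(drop="first", sparse_output=False)
demo = demo_enc.fit_transform(X[["sex"]])
print(demo_enc.get_feature_names_out(["sex"]))  # ['sex_male']
print(demo[:5])  # 1.0 = male, 0.0 = female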
In [49]:
categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)
numerical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        (
            "scaler",
            StandardScaler(),
        ),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Final pipeline
model_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        # ("classifier", RandomForestClassifier(random_state=42)),
        ("classifier", LogisticRegression(random_state=42)),
        # ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)

model_pipeline
Out[49]:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'fare', 'sibsp', 'parch'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  Index(['sex', 'pclass', 'embarked', 'title'], dtype='object'))])),
                ('classifier', LogisticRegression(random_state=42))])

Split the data into training and test sets¶

In [50]:
### Split the data into training and test sets (80/20, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
gc.collect()
(712, 8) (179, 8) (712,) (179,)
Out[50]:
0
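
Because stratify=y was passed, the class balance should be roughly the same in both splits. A quick check (added sketch):

In [ ]:
# Sketch: verify the stratified split preserves the survival rate in both sets.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))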
In [51]:
model = model_pipeline.named_steps["classifier"]
model
Out[51]:
LogisticRegression(random_state=42)

Applying preprocessing steps and fitting the model¶

In [52]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
In [53]:
encoded_columns = (
    model_pipeline[0]
    .transformers_[1][1]
    .named_steps["encoder"]
    .get_feature_names_out(categorical_cols)
)
# Combine the numerical columns and encoded categorical columns
all_columns = list(numerical_cols) + list(encoded_columns)

# Convert to DataFrame
X_train_df = pd.DataFrame(X_train_processed, columns=all_columns)
X_test_df = pd.DataFrame(X_test_processed, columns=all_columns)
X_train_df.head(10)
Out[53]:
age fare sibsp parch sex_male pclass_2 pclass_3 embarked_Q embarked_S title_Miss title_Mr title_Mrs title_Officer
0 0.166879 0.513812 -0.465084 -0.466183 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0
1 0.166879 -0.662563 -0.465084 -0.466183 1.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2 0.166879 3.955399 -0.465084 -0.466183 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
3 -0.887551 -0.467874 -0.465084 0.727782 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
4 0.091562 -0.115977 0.478335 0.727782 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
5 -0.661602 -0.486962 -0.465084 -0.466183 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0
6 -0.285020 0.513812 -0.465084 -0.466183 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0
7 0.166879 4.075040 -0.465084 -0.466183 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
8 0.166879 -0.501190 -0.465084 -0.466183 1.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0
9 0.091562 -0.287761 0.478335 -0.466183 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
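
As an alternative worth noting, recent scikit-learn versions (1.0+) can emit the combined feature names directly from the fitted ColumnTransformer, although with the default verbose_feature_names_out=True the names carry num__/cat__ prefixes. A minimal sketch:

In [ ]:
# Alternative sketch: let the fitted ColumnTransformer report the output names.
print(preprocessor.get_feature_names_out())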
In [54]:
model.fit(X_train_df, y_train)
Out[54]:
LogisticRegression(random_state=42)
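
Note that the cells above fit the preprocessor and the classifier separately, yet calling model_pipeline below still works: preprocessor and model are the very objects stored inside model_pipeline, so the pipeline is effectively fitted. The equivalent one-step alternative would be a single call on the raw training frame; a commented-out sketch:

In [ ]:
# Equivalent one-step alternative (sketch): fit preprocessing and classifier together.
# model_pipeline.fit(X_train, y_train)
# y_pred = model_pipeline.predict(X_test)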

Generating Predictions on Test Data¶

In [55]:
y_pred = model_pipeline.predict(X_test)
y_proba = model_pipeline.predict_proba(X_test)[:, 1]
print(y_pred[:10])
print(y_proba[:10])
[0 0 0 0 1 1 1 0 0 0]
[0.05099579 0.04055293 0.15135114 0.02703658 0.66735637 0.60975207
 0.7190783  0.49081549 0.26258101 0.12986952]
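
For this binary logistic regression the hard predictions are simply the probabilities thresholded at 0.5; a quick check (sketch):

In [ ]:
# Sketch: class labels are the survival probabilities thresholded at 0.5.
print((y_proba >= 0.5).astype(int)[:10])  # should match y_pred[:10] above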

Evaluating the Model¶

In [56]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    roc_curve,
    roc_auc_score,
    precision_recall_curve,
    average_precision_score,
)

print("Classification Report: \n", classification_report(y_test, y_pred))
print("Model Accuracy: ", f"{accuracy_score(y_test, y_pred) * 100:.2f}%")
Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       110
           1       0.82      0.72      0.77        69

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

Model Accuracy:  83.24%
In [57]:
cm = confusion_matrix(y_test, y_pred)
class_names = ["Not Survived", "Survived"]

# Plot
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()
[Image: Confusion Matrix heatmap]
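The headline metrics can also be read straight off the confusion matrix; a small cross-check (sketch, assuming the default label order [0, 1]):

In [ ]:
# Sketch: derive accuracy and positive-class recall from the confusion matrix.
tn, fp, fn, tp = cm.ravel()
print("Accuracy:", (tn + tp) / cm.sum())
print("Recall (survived):", tp / (tp + fn))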
In [58]:
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()
[Image: ROC Curve]
In [59]:
precision, recall, _ = precision_recall_curve(y_test, y_proba)
avg_precision = average_precision_score(y_test, y_proba)

plt.figure()
plt.plot(recall, precision, label=f"Avg Precision = {avg_precision:.2f}")
# Chance-level baseline for a PR curve: precision equal to the positive-class rate
plt.axhline(y_test.mean(), color="k", linestyle="--", label="No-skill baseline")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid(True)
plt.show()
[Image: Precision-Recall Curve]
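
A single 80/20 split can be optimistic or pessimistic by chance; as an optional robustness check (sketch), the full pipeline can be cross-validated on all of X and y:

In [ ]:
# Optional sketch: 5-fold cross-validated accuracy of the full pipeline.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring="accuracy")
print(f"{cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")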

Saving the model¶

In [ ]:
# import pickle

# # Save the pipeline to a file
# with open("models/model_pipeline.pkl", "wb") as f:
#     pickle.dump(model_pipeline, f)
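
As an alternative to pickle, joblib is commonly used for persisting scikit-learn objects because it handles large NumPy arrays efficiently; a commented-out sketch mirroring the cell above (the .joblib path is illustrative):

In [ ]:
# import joblib

# # Sketch: save and reload the pipeline with joblib instead of pickle.
# joblib.dump(model_pipeline, "models/model_pipeline.joblib")
# loaded_model = joblib.load("models/model_pipeline.joblib")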

Loading the model¶

In [21]:
X[:20]
Out[21]:
sex age fare pclass sibsp parch embarked title
0 male 22.0 7.2500 3 1 0 S Mr
1 female 38.0 71.2833 1 1 0 C Mrs
2 female 26.0 7.9250 3 0 0 S Miss
3 female 35.0 53.1000 1 1 0 S Mrs
4 male 35.0 8.0500 3 0 0 S Mr
5 male 32.0 8.4583 3 0 0 Q Mr
6 male 54.0 51.8625 1 0 0 S Mr
7 male 2.0 21.0750 3 3 1 S Master
8 female 27.0 11.1333 3 0 2 S Mrs
9 female 14.0 30.0708 2 1 0 C Mrs
10 female 4.0 16.7000 3 1 1 S Miss
11 female 58.0 26.5500 1 0 0 S Miss
12 male 20.0 8.0500 3 0 0 S Mr
13 male 39.0 31.2750 3 1 5 S Mr
14 female 14.0 7.8542 3 0 0 S Miss
15 female 55.0 16.0000 2 0 0 S Mrs
16 male 2.0 29.1250 3 4 1 Q Master
17 male 32.0 13.0000 2 0 0 S Mr
18 female 31.0 18.0000 3 1 0 S Mrs
19 female 35.0 7.2250 3 0 0 C Mrs
In [ ]:
# import pickle

# loaded_model = pickle.load(open("models/model_pipeline.pkl", "rb"))

# # Use it for prediction
# print("Model Predictions: ", loaded_model.predict(X[:20]))
# print("Original Labels: ", y[:20].values)
# print("Model Probabilities: ", loaded_model.predict_proba(X[:20])[:, 1])
# print("Model Score: ", loaded_model.score(X[:20], y[:20]))
Model Predictions:  [0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1]
Original Labels:  [0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1]
Model Probabilities:  [0.08       0.99       0.7        1.         0.02170042 0.00333333
 0.03       0.12       0.88       0.98       0.91       0.77
 0.099      0.02       0.09       0.94       0.16       0.436
 0.26       0.9       ]
Model Score:  0.95