Importing necessary libraries¶
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import gc
warnings.filterwarnings("ignore")
plt.style.use("seaborn-v0_8")
Loading the cleaned dataset from our previous work¶
In [2]:
df = pd.read_csv('data/titanic_cleaned.csv')
df.head()
Out[2]:
|  | passengerid | survived | pclass | name | sex | age | sibsp | parch | ticket | fare | embarked | title |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | S | Mr |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C | Mrs |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | Miss |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | S | Mrs |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | S | Mr |
Feature Selection¶
In [3]:
print("*" * 100)
print("Features Information")
df.info()  # info() prints its report and returns None, so display() is not needed
print("*" * 100)
print("Unique Values in Each Column")
display(df.nunique())
print("*" * 100)
print("Descriptive Statistics")
display(df.describe())
print("*" * 100)
****************************************************************************************************
Features Information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          891 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  embarked     891 non-null    object 
 11  title        891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
****************************************************************************************************
Unique Values in Each Column
passengerid    891
survived         2
pclass           3
name           891
sex              2
age             88
sibsp            7
parch            7
ticket         681
fare           248
embarked         3
title            5
dtype: int64
****************************************************************************************************
Descriptive Statistics
|  | passengerid | survived | pclass | age | sibsp | parch | fare |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.659001 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 13.289967 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 21.000000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 30.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 35.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
****************************************************************************************************
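The summary above suggests fare is heavily right-skewed (std ≈ 49.7 against a mean of ≈ 32.2). Before the categorical tests below, a quick look at how the numeric columns correlate with survival can also inform feature selection. A minimal sketch, assuming only the columns already in df:

In [ ]:
# Pairwise correlation of the numeric columns, including the target
numeric_feats = ["survived", "pclass", "age", "sibsp", "parch", "fare"]
corr = df[numeric_feats].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.title("Correlation of Numeric Features with Survival")
plt.tight_layout()
plt.show()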
In [4]:
from scipy.stats import chi2_contingency

# Chi-squared test of independence between each categorical feature and survival
for feat in ["pclass", "sex", "embarked", "title", "ticket"]:
    df[feat] = df[feat].astype("category")
    contingency_table = pd.crosstab(df[feat], df["survived"])
    display(pd.DataFrame(contingency_table))
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi2: {chi2}, p-value: {p}")
    # "ticket" has 681 levels, so it is skipped when plotting; its sparse
    # contingency table also leaves many expected counts below 5, making the
    # chi-squared approximation unreliable for that feature
    if feat != "ticket":
        sns.countplot(data=df, x=feat, hue="survived")
        plt.title(f"Survival Count by {feat}")
        plt.show()
| pclass | survived = 0 | survived = 1 |
|---|---|---|
| 1 | 80 | 136 |
| 2 | 97 | 87 |
| 3 | 372 | 119 |
Chi2: 102.88898875696056, p-value: 4.549251711298793e-23
| sex | survived = 0 | survived = 1 |
|---|---|---|
| female | 81 | 233 |
| male | 468 | 109 |
Chi2: 260.71702016732104, p-value: 1.1973570627755645e-58
| embarked | survived = 0 | survived = 1 |
|---|---|---|
| C | 75 | 93 |
| Q | 47 | 30 |
| S | 427 | 219 |
Chi2: 25.964452881874784, p-value: 2.3008626481449577e-06
| title | survived = 0 | survived = 1 |
|---|---|---|
| Master | 17 | 23 |
| Miss | 55 | 129 |
| Mr | 437 | 82 |
| Mrs | 26 | 103 |
| Officer | 14 | 5 |
Chi2: 291.3420130772467, p-value: 7.984173381155626e-62
| ticket | survived = 0 | survived = 1 |
|---|---|---|
| 110152 | 0 | 3 |
| 110413 | 1 | 2 |
| 110465 | 2 | 0 |
| 110564 | 0 | 1 |
| 110813 | 0 | 1 |
| ... | ... | ... |
| W./C. 6608 | 4 | 0 |
| W./C. 6609 | 1 | 0 |
| W.E.P. 5734 | 1 | 0 |
| W/C 14208 | 1 | 0 |
| WE/P 5735 | 1 | 1 |

681 rows × 2 columns
Chi2: 766.5697029458893, p-value: 0.01152729601163775
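The chi-squared statistic grows with sample size and table dimensions, so the raw values above are hard to compare across features (note how ticket gets a large Chi2 purely from its 681 sparse levels). Cramér's V normalizes it into a 0-to-1 effect size; a minimal sketch, where the helper cramers_v is ours, not part of the original notebook:

In [ ]:
def cramers_v(table):
    """Cramér's V effect size (0 = no association, 1 = perfect) for a contingency table."""
    chi2, _, _, _ = chi2_contingency(table)
    n = table.to_numpy().sum()
    r, c = table.shape
    return np.sqrt(chi2 / (n * (min(r, c) - 1)))

for feat in ["pclass", "sex", "embarked", "title", "ticket"]:
    table = pd.crosstab(df[feat], df["survived"])
    print(f"{feat}: Cramér's V = {cramers_v(table):.3f}")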
Splitting the data into features and target variable (dependent vs. independent)¶
In [5]:
X = df[["sex", "age", "fare", "pclass", "sibsp", "parch", "embarked", "title"]]
y = df["survived"]
print("X: ", X.shape)
print("y: ", y.shape)
X: (891, 8)
y: (891,)
In [6]:
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   sex       891 non-null    category
 1   age       891 non-null    float64 
 2   fare      891 non-null    float64 
 3   pclass    891 non-null    category
 4   sibsp     891 non-null    int64   
 5   parch     891 non-null    int64   
 6   embarked  891 non-null    category
 7   title     891 non-null    category
dtypes: category(4), float64(2), int64(2)
memory usage: 32.0 KB
We need to convert categorical variables into numerical format¶
In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
In [8]:
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
print(numerical_cols) # ['age', 'fare', 'sibsp', 'parch']
categorical_cols = X.select_dtypes("category").columns
print(categorical_cols) # ['sex', 'pclass', 'embarked', 'title']
Index(['age', 'fare', 'sibsp', 'parch'], dtype='object')
Index(['sex', 'pclass', 'embarked', 'title'], dtype='object')
In [49]:
categorical_pipeline = Pipeline(
[
("imputer", SimpleImputer(strategy="most_frequent")),
("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
]
)
numerical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)
preprocessor = ColumnTransformer(
transformers=[
("num", numerical_pipeline, numerical_cols),
("cat", categorical_pipeline, categorical_cols),
]
)
# Final pipeline
model_pipeline = Pipeline(
steps=[
("preprocessing", preprocessor),
# ("classifier", RandomForestClassifier(random_state=42)),
("classifier", LogisticRegression(random_state=42)),
# ("classifier", DecisionTreeClassifier(random_state=42)),
]
)
model_pipeline
Out[49]:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'fare', 'sibsp', 'parch'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  Index(['sex', 'pclass', 'embarked', 'title'], dtype='object'))])),
                ('classifier', LogisticRegression(random_state=42))])
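The commented-out classifiers in the pipeline cell suggest several models were tried. Rather than swapping them in by hand, 5-fold cross-validation can compare them on the full data; a minimal sketch (cross_val_score clones the pipeline internally, so the fitted objects above stay untouched):

In [ ]:
from sklearn.model_selection import cross_val_score

candidates = [
    LogisticRegression(random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
]
for clf in candidates:
    candidate_pipeline = Pipeline(
        steps=[("preprocessing", preprocessor), ("classifier", clf)]
    )
    scores = cross_val_score(candidate_pipeline, X, y, cv=5, scoring="accuracy")
    print(f"{clf.__class__.__name__}: {scores.mean():.3f} (+/- {scores.std():.3f})")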
Splitting the data into training and test sets¶
In [50]:
# Split the data into training and test sets (80/20, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=0.2,
random_state=42,
stratify=y,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
gc.collect()
(712, 8) (179, 8) (712,) (179,)
Out[50]:
0
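Since stratify=y was passed, the survival rate should be nearly identical in both splits; a quick sanity check:

In [ ]:
# Stratified splitting preserves the class balance in both subsets
print(f"Survival rate in y_train: {y_train.mean():.3f}")
print(f"Survival rate in y_test:  {y_test.mean():.3f}")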
In [51]:
model = model_pipeline.named_steps["classifier"]
model
Out[51]:
LogisticRegression(random_state=42)
Applying preprocessing steps and fitting the model¶
In [52]:
# Fit on the training data only (avoids leakage); model_pipeline shares this
# preprocessor object, which is why model_pipeline.predict() works later on
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
In [53]:
encoded_columns = (
model_pipeline[0]
.transformers_[1][1]
.named_steps["encoder"]
.get_feature_names_out(categorical_cols)
)
# Combine the numerical columns and encoded categorical columns
all_columns = list(numerical_cols) + list(encoded_columns)
# Convert to DataFrame
X_train_df = pd.DataFrame(X_train_processed, columns=all_columns)
X_test_df = pd.DataFrame(X_test_processed, columns=all_columns)
X_train_df.head(10)
Out[53]:
|  | age | fare | sibsp | parch | sex_male | pclass_2 | pclass_3 | embarked_Q | embarked_S | title_Miss | title_Mr | title_Mrs | title_Officer |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.166879 | 0.513812 | -0.465084 | -0.466183 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 1 | 0.166879 | -0.662563 | -0.465084 | -0.466183 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 2 | 0.166879 | 3.955399 | -0.465084 | -0.466183 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 3 | -0.887551 | -0.467874 | -0.465084 | 0.727782 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 0.091562 | -0.115977 | 0.478335 | 0.727782 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 5 | -0.661602 | -0.486962 | -0.465084 | -0.466183 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 6 | -0.285020 | 0.513812 | -0.465084 | -0.466183 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 7 | 0.166879 | 4.075040 | -0.465084 | -0.466183 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 8 | 0.166879 | -0.501190 | -0.465084 | -0.466183 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| 9 | 0.091562 | -0.287761 | 0.478335 | -0.466183 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
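Digging the encoder out of the fitted ColumnTransformer works, but scikit-learn 1.0+ exposes the combined feature names directly; a shorter equivalent sketch (names come back prefixed, e.g. num__age and cat__sex_male, because verbose_feature_names_out defaults to True):

In [ ]:
# Equivalent, shorter route to the processed column names (scikit-learn >= 1.0)
feature_names = preprocessor.get_feature_names_out()
X_train_named = pd.DataFrame(X_train_processed, columns=feature_names, index=X_train.index)
X_test_named = pd.DataFrame(X_test_processed, columns=feature_names, index=X_test.index)
X_train_named.head()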
In [54]:
model.fit(X_train_df, y_train)
Out[54]:
LogisticRegression(random_state=42)
Generating Predictions on Test Data¶
In [55]:
y_pred = model_pipeline.predict(X_test)
y_proba = model_pipeline.predict_proba(X_test)[:, 1]
print(y_pred[:10])
print(y_proba[:10])
[0 0 0 0 1 1 1 0 0 0]
[0.05099579 0.04055293 0.15135114 0.02703658 0.66735637 0.60975207
 0.7190783  0.49081549 0.26258101 0.12986952]
Evaluating the Model¶
In [56]:
from sklearn.metrics import (
classification_report,
confusion_matrix,
accuracy_score,
roc_curve,
roc_auc_score,
precision_recall_curve,
average_precision_score,
)
print("Classification Report: \n", classification_report(y_test, y_pred))
print("Model Accuracy: ", f"{accuracy_score(y_test, y_pred) * 100:.2f}%")
Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       110
           1       0.82      0.72      0.77        69

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

Model Accuracy:  83.24%
In [57]:
cm = confusion_matrix(y_test, y_pred)
class_names = ["Not Survived", "Survived"]
# Plot
plt.figure(figsize=(8, 6))
sns.heatmap(
cm,
annot=True,
fmt="d",
cmap="Blues",
xticklabels=class_names,
yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()
In [58]:
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()
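The predictions above come from the default 0.5 probability cutoff, which is not necessarily optimal. One common heuristic is to pick the threshold that maximizes Youden's J statistic (TPR - FPR) along the ROC curve; a minimal sketch reusing the arrays above (strictly, a threshold should be tuned on a validation split rather than the test set):

In [ ]:
# Youden's J = TPR - FPR; its maximizer balances sensitivity and specificity
j_scores = tpr - fpr
best_threshold = thresholds[np.argmax(j_scores)]
print(f"Threshold maximizing Youden's J: {best_threshold:.3f}")

y_pred_tuned = (y_proba >= best_threshold).astype(int)
print("Accuracy at tuned threshold:", f"{accuracy_score(y_test, y_pred_tuned) * 100:.2f}%")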
In [59]:
precision, recall, _ = precision_recall_curve(y_test, y_proba)
avg_precision = average_precision_score(y_test, y_proba)
plt.figure()
plt.plot(recall, precision, label=f"Avg Precision = {avg_precision:.2f}")
# The PR-curve baseline is a horizontal line at the positive-class prevalence,
# not the ROC-style diagonal
plt.axhline(y=y_test.mean(), color="k", linestyle="--", label=f"Baseline = {y_test.mean():.2f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid(True)
plt.show()
Saving the model¶
In [ ]:
# import pickle
# # Save the pipeline to a file
# with open("models/model_pipeline.pkl", "wb") as f:
# pickle.dump(model_pipeline, f)
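For scikit-learn objects that wrap large numpy arrays, joblib is the commonly recommended alternative to pickle; a sketch under the same assumed models/ directory, kept commented out like the cell above:

In [ ]:
# import joblib
# joblib.dump(model_pipeline, "models/model_pipeline.joblib")
# model_pipeline = joblib.load("models/model_pipeline.joblib")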
Loading the model¶
In [21]:
X[:20]
Out[21]:
|  | sex | age | fare | pclass | sibsp | parch | embarked | title |
|---|---|---|---|---|---|---|---|---|
| 0 | male | 22.0 | 7.2500 | 3 | 1 | 0 | S | Mr |
| 1 | female | 38.0 | 71.2833 | 1 | 1 | 0 | C | Mrs |
| 2 | female | 26.0 | 7.9250 | 3 | 0 | 0 | S | Miss |
| 3 | female | 35.0 | 53.1000 | 1 | 1 | 0 | S | Mrs |
| 4 | male | 35.0 | 8.0500 | 3 | 0 | 0 | S | Mr |
| 5 | male | 32.0 | 8.4583 | 3 | 0 | 0 | Q | Mr |
| 6 | male | 54.0 | 51.8625 | 1 | 0 | 0 | S | Mr |
| 7 | male | 2.0 | 21.0750 | 3 | 3 | 1 | S | Master |
| 8 | female | 27.0 | 11.1333 | 3 | 0 | 2 | S | Mrs |
| 9 | female | 14.0 | 30.0708 | 2 | 1 | 0 | C | Mrs |
| 10 | female | 4.0 | 16.7000 | 3 | 1 | 1 | S | Miss |
| 11 | female | 58.0 | 26.5500 | 1 | 0 | 0 | S | Miss |
| 12 | male | 20.0 | 8.0500 | 3 | 0 | 0 | S | Mr |
| 13 | male | 39.0 | 31.2750 | 3 | 1 | 5 | S | Mr |
| 14 | female | 14.0 | 7.8542 | 3 | 0 | 0 | S | Miss |
| 15 | female | 55.0 | 16.0000 | 2 | 0 | 0 | S | Mrs |
| 16 | male | 2.0 | 29.1250 | 3 | 4 | 1 | Q | Master |
| 17 | male | 32.0 | 13.0000 | 2 | 0 | 0 | S | Mr |
| 18 | female | 31.0 | 18.0000 | 3 | 1 | 0 | S | Mrs |
| 19 | female | 35.0 | 7.2250 | 3 | 0 | 0 | C | Mrs |
In [ ]:
# import pickle
# loaded_model = pickle.load(open("models/model_pipeline.pkl", "rb"))
# # Use it for prediction
# print("Model Predictions: ", loaded_model.predict(X[:20]))
# print("Original Labels: ", y[:20].values)
# print("Model Probabilities: ", loaded_model.predict_proba(X[:20])[:, 1])
# print("Model Score: ", loaded_model.score(X[:20], y[:20]))
Model Predictions:  [0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1]
Original Labels:  [0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1]
Model Probabilities:  [0.08 0.99 0.7 1. 0.02170042 0.00333333 0.03 0.12 0.88 0.98 0.91 0.77 0.099 0.02 0.09 0.94 0.16 0.436 0.26 0.9]
Model Score:  0.95