Importing necessary libraries¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import gc

warnings.filterwarnings("ignore")
plt.style.use("seaborn-v0_8")

Loading the cleaned dataset from our previous work¶

In [2]:
df = pd.read_csv('data/titanic_cleaned.csv')
df.head()
Out[2]:
passengerid survived pclass name sex age sibsp parch ticket fare embarked title
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S Mr
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C Mrs
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S Miss
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 S Mrs
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S Mr

Feature Selection¶

In [3]:
print("*" * 100)
print("Features Information")
display(df.info())
print("*" * 100)
print("Unique Values in Each Column")
display(df.nunique())
print("*" * 100)
print("Descriptive Statistics")
display(df.describe())
print("*" * 100)
****************************************************************************************************
Features Information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          891 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  embarked     891 non-null    object 
 11  title        891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
****************************************************************************************************
Unique Values in Each Column
passengerid    891
survived         2
pclass           3
name           891
sex              2
age             88
sibsp            7
parch            7
ticket         681
fare           248
embarked         3
title            5
dtype: int64
****************************************************************************************************
Descriptive Statistics
passengerid survived pclass age sibsp parch fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.659001 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 13.289967 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 21.000000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 30.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 35.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
****************************************************************************************************

Dependent vs. Independent Variables¶

Testing Feature Relevance with the Chi-Square Test¶

In [4]:
from scipy.stats import chi2_contingency

# Perform Chi-Squared Test for Categorical Features

for feat in ["pclass", "sex", "embarked", "title", "ticket"]:
    df[feat] = df[feat].astype("category")
    contingency_table = pd.crosstab(df[feat], df["survived"])
    display(pd.DataFrame(contingency_table))
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi2: {chi2}, p-value: {p}")
    if feat != "ticket":
        sns.countplot(data=df, x=feat, hue="survived")
        plt.title(f"Survival Count by {feat}")
        plt.show()
survived 0 1
pclass
1 80 136
2 97 87
3 372 119
Chi2: 102.88898875696056, p-value: 4.549251711298793e-23
[Image: Survival Count by pclass]
survived 0 1
sex
female 81 233
male 468 109
Chi2: 260.71702016732104, p-value: 1.1973570627755645e-58
[Image: Survival Count by sex]
survived 0 1
embarked
C 75 93
Q 47 30
S 427 219
Chi2: 25.964452881874784, p-value: 2.3008626481449577e-06
[Image: Survival Count by embarked]
survived 0 1
title
Master 17 23
Miss 55 129
Mr 437 82
Mrs 26 103
Officer 14 5
Chi2: 291.3420130772467, p-value: 7.984173381155626e-62
[Image: Survival Count by title]
survived 0 1
ticket
110152 0 3
110413 1 2
110465 2 0
110564 0 1
110813 0 1
... ... ...
W./C. 6608 4 0
W./C. 6609 1 0
W.E.P. 5734 1 0
W/C 14208 1 0
WE/P 5735 1 1

681 rows × 2 columns

Chi2: 766.5697029458893, p-value: 0.01152729601163775
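
As a sanity check (added here, not part of the original run), the statistic can be reproduced by hand: the chi-square test compares the observed counts with the counts expected under independence, chi2 = sum((observed - expected)^2 / expected). A minimal sketch using the pclass crosstab printed above (np is already imported at the top of the notebook):

In [ ]:
# Sanity-check sketch: reproduce the pclass chi-square statistic by hand.
# Observed counts are copied from the pclass crosstab printed above.
observed = np.array([[80, 136], [97, 87], [372, 119]])
expected = np.outer(observed.sum(axis=1), observed.sum(axis=0)) / observed.sum()
chi2_manual = ((observed - expected) ** 2 / expected).sum()
print(chi2_manual)  # ~102.89, matching chi2_contingency above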

Split the data into features (independent variables) and target (dependent variable)¶

In [5]:
X = df[["sex", "age", "fare", "pclass", "sibsp", "parch", "embarked", "title"]]
y = df["survived"]

print("X: ", X.shape)
print("y: ", y.shape)
X:  (891, 8)
y:  (891,)
In [6]:
X.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   sex       891 non-null    category
 1   age       891 non-null    float64 
 2   fare      891 non-null    float64 
 3   pclass    891 non-null    category
 4   sibsp     891 non-null    int64   
 5   parch     891 non-null    int64   
 6   embarked  891 non-null    category
 7   title     891 non-null    category
dtypes: category(4), float64(2), int64(2)
memory usage: 32.0 KB

We need to convert categorical variables into numerical format¶

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
In [8]:
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
print(numerical_cols)  # ['age', 'fare', 'sibsp', 'parch']
categorical_cols = X.select_dtypes("category").columns
print(categorical_cols)  # ['sex', 'pclass', 'embarked', 'title']
Index(['age', 'fare', 'sibsp', 'parch'], dtype='object')
Index(['sex', 'pclass', 'embarked', 'title'], dtype='object')
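
Before wiring everything into a pipeline, it can help to see what the encoder produces on a single column. A minimal illustration sketch (assuming scikit-learn ≥ 1.2 for the sparse_output argument): with drop="first", the binary sex column collapses to a single sex_male indicator.

In [ ]:
# Illustration sketch: one-hot encoding the 'sex' column with drop="first".
from sklearn.preprocessing import OneHotEncoder

demo_enc = OneHotEncoder(drop="first", sparse_output=False)
demo = demo_enc.fit_transform(X[["sex"]])
print(demo_enc.get_feature_names_out(["sex"]))  # ['sex_male']
print(demo[:5])  # 1.0 = male, 0.0 = female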
In [49]:
categorical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)
numerical_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        (
            "scaler",
            StandardScaler(),
        ),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("cat", categorical_pipeline, categorical_cols),
    ]
)

# Final pipeline
model_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        # ("classifier", RandomForestClassifier(random_state=42)),
        ("classifier", LogisticRegression(random_state=42)),
        # ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)

model_pipeline
Out[49]:
Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['age', 'fare', 'sibsp', 'parch'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(drop='first',
                                                                                 handle_unknown='ignore'))]),
                                                  Index(['sex', 'pclass', 'embarked', 'title'], dtype='object'))])),
                ('classifier', LogisticRegression(random_state=42))])

Split the data into training and test sets¶

In [50]:
### Split the data into training and test sets (80/20, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
gc.collect()
(712, 8) (179, 8) (712,) (179,)
Out[50]:
0
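
Because stratify=y was passed, the class balance should be roughly the same in both splits. A quick check (added sketch):

In [ ]:
# Sketch: verify the stratified split preserves the survival rate in both sets.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))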
In [51]:
model = model_pipeline.named_steps["classifier"]
model
Out[51]:
LogisticRegression(random_state=42)

Applying preprocessing steps and fitting the model¶

In [52]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
In [53]:
encoded_columns = (
    model_pipeline[0]
    .transformers_[1][1]
    .named_steps["encoder"]
    .get_feature_names_out(categorical_cols)
)
# Combine the numerical columns and encoded categorical columns
all_columns = list(numerical_cols) + list(encoded_columns)

# Convert to DataFrame
X_train_df = pd.DataFrame(X_train_processed, columns=all_columns)
X_test_df = pd.DataFrame(X_test_processed, columns=all_columns)
X_train_df.head(10)
Out[53]:
age fare sibsp parch sex_male pclass_2 pclass_3 embarked_Q embarked_S title_Miss title_Mr title_Mrs title_Officer
0 0.166879 0.513812 -0.465084 -0.466183 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0
1 0.166879 -0.662563 -0.465084 -0.466183 1.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
2 0.166879 3.955399 -0.465084 -0.466183 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0
3 -0.887551 -0.467874 -0.465084 0.727782 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
4 0.091562 -0.115977 0.478335 0.727782 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
5 -0.661602 -0.486962 -0.465084 -0.466183 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0
6 -0.285020 0.513812 -0.465084 -0.466183 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0
7 0.166879 4.075040 -0.465084 -0.466183 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
8 0.166879 -0.501190 -0.465084 -0.466183 1.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0
9 0.091562 -0.287761 0.478335 -0.466183 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
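
As an alternative worth noting, recent scikit-learn versions (1.0+) can emit the combined feature names directly from the fitted ColumnTransformer, although with the default verbose_feature_names_out=True the names carry num__/cat__ prefixes. A minimal sketch:

In [ ]:
# Alternative sketch: let the fitted ColumnTransformer report the output names.
print(preprocessor.get_feature_names_out())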
In [54]:
model.fit(X_train_df, y_train)
Out[54]:
LogisticRegression(random_state=42)
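
Note that the cells above fit the preprocessor and the classifier separately, yet calling model_pipeline below still works: preprocessor and model are the very objects stored inside model_pipeline, so the pipeline is effectively fitted. The equivalent one-step alternative would be a single call on the raw training frame; a commented-out sketch:

In [ ]:
# Equivalent one-step alternative (sketch): fit preprocessing and classifier together.
# model_pipeline.fit(X_train, y_train)
# y_pred = model_pipeline.predict(X_test)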

Generating Predictions on Test Data¶

In [55]:
y_pred = model_pipeline.predict(X_test)
y_proba = model_pipeline.predict_proba(X_test)[:, 1]
print(y_pred[:10])
print(y_proba[:10])
[0 0 0 0 1 1 1 0 0 0]
[0.05099579 0.04055293 0.15135114 0.02703658 0.66735637 0.60975207
 0.7190783  0.49081549 0.26258101 0.12986952]
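
For this binary logistic regression the hard predictions are simply the probabilities thresholded at 0.5; a quick check (sketch):

In [ ]:
# Sketch: class labels are the survival probabilities thresholded at 0.5.
print((y_proba >= 0.5).astype(int)[:10])  # should match y_pred[:10] above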

Evaluating the Model¶

In [56]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    roc_curve,
    roc_auc_score,
    precision_recall_curve,
    average_precision_score,
)

print("Classification Report: \n", classification_report(y_test, y_pred))
print("Model Accuracy: ", f"{accuracy_score(y_test, y_pred) * 100:.2f}%")
Classification Report: 
               precision    recall  f1-score   support

           0       0.84      0.90      0.87       110
           1       0.82      0.72      0.77        69

    accuracy                           0.83       179
   macro avg       0.83      0.81      0.82       179
weighted avg       0.83      0.83      0.83       179

Model Accuracy:  83.24%
In [57]:
cm = confusion_matrix(y_test, y_pred)
class_names = ["Not Survived", "Survived"]

# Plot
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()
[Image: Confusion Matrix heatmap]
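The headline metrics can also be read straight off the confusion matrix; a small cross-check (sketch, assuming the default label order [0, 1]):

In [ ]:
# Sketch: derive accuracy and positive-class recall from the confusion matrix.
tn, fp, fn, tp = cm.ravel()
print("Accuracy:", (tn + tp) / cm.sum())
print("Recall (survived):", tp / (tp + fn))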
In [58]:
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()
[Image: ROC Curve]
In [59]:
precision, recall, _ = precision_recall_curve(y_test, y_proba)
avg_precision = average_precision_score(y_test, y_proba)

plt.figure()
plt.plot(recall, precision, label=f"Avg Precision = {avg_precision:.2f}")
# Chance-level baseline for a PR curve: precision equal to the positive-class rate
plt.axhline(y_test.mean(), color="k", linestyle="--", label="No-skill baseline")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid(True)
plt.show()
[Image: Precision-Recall Curve]
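
A single 80/20 split can be optimistic or pessimistic by chance; as an optional robustness check (sketch), the full pipeline can be cross-validated on all of X and y:

In [ ]:
# Optional sketch: 5-fold cross-validated accuracy of the full pipeline.
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring="accuracy")
print(f"{cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")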

Saving the model¶

In [ ]:
# import pickle

# # Save the pipeline to a file
# with open("models/model_pipeline.pkl", "wb") as f:
#     pickle.dump(model_pipeline, f)
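
As an alternative to pickle, joblib is commonly used for persisting scikit-learn objects because it handles large NumPy arrays efficiently; a commented-out sketch mirroring the cell above (the .joblib path is illustrative):

In [ ]:
# import joblib

# # Sketch: save and reload the pipeline with joblib instead of pickle.
# joblib.dump(model_pipeline, "models/model_pipeline.joblib")
# loaded_model = joblib.load("models/model_pipeline.joblib")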

Loading the model¶

In [21]:
X[:20]
Out[21]:
sex age fare pclass sibsp parch embarked title
0 male 22.0 7.2500 3 1 0 S Mr
1 female 38.0 71.2833 1 1 0 C Mrs
2 female 26.0 7.9250 3 0 0 S Miss
3 female 35.0 53.1000 1 1 0 S Mrs
4 male 35.0 8.0500 3 0 0 S Mr
5 male 32.0 8.4583 3 0 0 Q Mr
6 male 54.0 51.8625 1 0 0 S Mr
7 male 2.0 21.0750 3 3 1 S Master
8 female 27.0 11.1333 3 0 2 S Mrs
9 female 14.0 30.0708 2 1 0 C Mrs
10 female 4.0 16.7000 3 1 1 S Miss
11 female 58.0 26.5500 1 0 0 S Miss
12 male 20.0 8.0500 3 0 0 S Mr
13 male 39.0 31.2750 3 1 5 S Mr
14 female 14.0 7.8542 3 0 0 S Miss
15 female 55.0 16.0000 2 0 0 S Mrs
16 male 2.0 29.1250 3 4 1 Q Master
17 male 32.0 13.0000 2 0 0 S Mr
18 female 31.0 18.0000 3 1 0 S Mrs
19 female 35.0 7.2250 3 0 0 C Mrs
In [ ]:
# import pickle

# loaded_model = pickle.load(open("models/model_pipeline.pkl", "rb"))

# # Use it for prediction
# print("Model Predictions: ", loaded_model.predict(X[:20]))
# print("Original Labels: ", y[:20].values)
# print("Model Probabilities: ", loaded_model.predict_proba(X[:20])[:, 1])
# print("Model Score: ", loaded_model.score(X[:20], y[:20]))
Model Predictions:  [0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1]
Original Labels:  [0 1 1 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 0 1]
Model Probabilities:  [0.08       0.99       0.7        1.         0.02170042 0.00333333
 0.03       0.12       0.88       0.98       0.91       0.77
 0.099      0.02       0.09       0.94       0.16       0.436
 0.26       0.9       ]
Model Score:  0.95