From 18c577a571dc159392575335df0d42983542aa8f Mon Sep 17 00:00:00 2001
From: Simran Shaikh
Date: Fri, 8 Nov 2024 22:49:01 +0530
Subject: [PATCH] added .py file #208

---
 ...tia_prediction_using_different_ml_model.py | 572 ++++++++++++++++++
 1 file changed, 572 insertions(+)
 create mode 100644 pages/dementia_prediction_using_different_ml_model.py

diff --git a/pages/dementia_prediction_using_different_ml_model.py b/pages/dementia_prediction_using_different_ml_model.py
new file mode 100644
index 00000000..83f56674
--- /dev/null
+++ b/pages/dementia_prediction_using_different_ml_model.py
@@ -0,0 +1,572 @@
# -*- coding: utf-8 -*-
"""dementia-prediction-using-different-ml-model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1oTgATmP3qQHo6gQkv680M9klmx95_zqO

## Importing Libs
"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd  # used to load and manipulate the data and for one-hot encoding
import numpy as np  # data manipulation
# %matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.utils import resample  # for downsampling the dataset
from sklearn.model_selection import train_test_split  # for splitting the dataset into train and test sets
from sklearn.preprocessing import scale  # scale and center the data
from sklearn.svm import SVC  # will make an SVM for classification
from sklearn.model_selection import GridSearchCV  # will do the cross validation
from sklearn.metrics import ConfusionMatrixDisplay  # will draw the confusion matrix (plot_confusion_matrix was removed in scikit-learn 1.2)
from sklearn.decomposition import PCA  # to perform PCA to plot the data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, roc_curve, auc
import seaborn as sns

"""## Load the data"""

data = pd.read_csv("../input/mri-and-alzheimers/oasis_longitudinal.csv")

"""## Explore the data"""

pd.set_option('display.max_columns', None)  # show all columns of a pandas dataframe
pd.set_option('display.max_rows', None)  # show all rows of a pandas dataframe

data.head()
# data.tail()
# data.size

data.shape

data.info()

"""## Converting Categorical Data to Numerical Data

When **inplace=True**, the data is modified in place: the call returns nothing and the dataframe itself is updated.
When **inplace=False** (the *default*), the operation returns a modified copy of the object, which you then need to assign to a variable.

Set axis=0 (or axis='rows') to operate along rows.

Set axis=1 (or axis='columns') to operate along columns.
"""

data['M/F'] = [1 if each == "M" else 0 for each in data['M/F']]
data['Group'] = [1 if each == "Demented" or each == "Converted" else 0 for each in data['Group']]
# data['Group'] = data['Group'].replace(['Converted'], ['Demented']) # Target variable
# data['Group'] = data['Group'].replace(['Demented', 'Nondemented'], [1,0]) # Target variable
data.info()
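"""A hypothetical alternative (kept commented out, since the columns above have
already been converted): an explicit dictionary passed to `Series.map` makes the
same encoding self-documenting, assuming the raw labels are exactly "M"/"F" and
"Nondemented"/"Demented"/"Converted". Any unexpected label would surface as NaN
instead of being silently encoded as 0."""

# sex_map = {"M": 1, "F": 0}
# group_map = {"Demented": 1, "Converted": 1, "Nondemented": 0}
# data['M/F'] = data['M/F'].map(sex_map)
# data['Group'] = data['Group'].map(group_map)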
"""Note: based on the given data, **CDR** (Clinical Dementia Rating) indicates the patient's condition, i.e. whether or not the patient has dementia.

CDR Value Meaning:

* 0 ---> Normal
* 0.5 ---> Very Mild Dementia
* 1 ---> Mild Dementia
* 2 ---> Moderate Dementia
* 3 ---> Severe Dementia

## Correlation Between Attributes
"""

# numeric_only=True keeps the remaining string columns out of the correlation
# (required on pandas >= 2.0, where corr() no longer drops them silently)
correlation_matrix = data.corr(numeric_only=True)
data_corr = correlation_matrix['Group'].sort_values(ascending=False)
data_corr

from pandas.plotting import scatter_matrix

attributes = ["Group", "CDR", "M/F", "SES", "ASF"]

scatter_matrix(data[attributes], figsize=(15, 11), alpha=0.3)

import plotly.express as px

fig = px.scatter(data, x='Group', y='SES', color='Group')
fig.show()

fig = px.scatter(data, x='Group', y='Age', color='Group')
fig.show()

fig = px.scatter(data, x='Group', y='ASF', color='Group')
fig.show()

"""## Checking For Missing/Null Values"""

data.isnull().sum()

"""### Taking the median value for the missing values of MMSE"""

median = data['MMSE'].median()
data['MMSE'] = data['MMSE'].fillna(median)  # assignment instead of inplace=True avoids pandas chained-assignment warnings
data.isnull().sum()

"""### Taking the median value for the missing values of SES"""

median = data['SES'].median()
data['SES'] = data['SES'].fillna(median)
data.isnull().sum()

"""## Train-Test Split

## Prepare the data for X and y where,

1. X = the columns/features used for **making the prediction**
2. y = the **value to be predicted**
"""

y = data['Group'].values
X = data[['M/F', 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']]

"""### Train-Test Distribution Without Stratified Sampling"""

# by default test_size=0.25
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

df_ytrain = pd.DataFrame(y_trainval)
df_ytest = pd.DataFrame(y_test)

print('In Training Split:')
print(df_ytrain[0].value_counts())

print('\nIn Testing Split:')
print(df_ytest[0].value_counts())

"""### With Stratified Sampling"""

# by default test_size=0.25
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

df_ytrain = pd.DataFrame(y_trainval)
df_ytest = pd.DataFrame(y_test)

print('In Training Split:')
print(df_ytrain[0].value_counts())

print('\nIn Testing Split:')
print(df_ytest[0].value_counts())

"""### Scale the dataset"""

# StandardScaler() computes z = (x - u) / s
scaler = StandardScaler().fit(X_trainval)
# scaler = MinMaxScaler().fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

X_trainval_scaled

X_trainval.describe()

"""## Data Visualization"""

X_trainval.hist(bins=30, figsize=(20, 15))
plt.show()

fig = px.histogram(X_trainval, x='eTIV', nbins=50)
fig.show()

fig = px.scatter(X_trainval, x='eTIV')
fig.show()

"""# SVM"""

clf_svm = SVC(random_state=42)
clf_svm.fit(X_trainval_scaled, y_trainval)

# confusion matrix on the held-out test split
ConfusionMatrixDisplay.from_estimator(clf_svm,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = clf_svm.score(X_trainval_scaled, y_trainval)
test_score = clf_svm.score(X_test_scaled, y_test)
y_predict = clf_svm.predict(X_test_scaled)

test_recall = recall_score(y_test, y_predict)
fpr, tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(fpr, tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
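"""`roc_curve` on hard 0/1 predictions yields only a single operating point, so
the AUC above is coarse. A sketch (not part of the original notebook) that uses
the SVM's continuous `decision_function` scores instead gives a proper
threshold-based curve:"""

# signed distance of each test sample from the separating hyperplane
y_scores = clf_svm.decision_function(X_test_scaled)
fpr_s, tpr_s, thresholds_s = roc_curve(y_test, y_scores)
print("Test AUC from decision scores", auc(fpr_s, tpr_s))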
"""### Optimize parameters (fine-tuning) --> GridSearchCV() for SVM"""

# C = 1 and gamma = 'scale' are the defaults
# C controls how wide the margin is with respect to how many misclassifications we allow:
# increasing C shrinks the margin and permits fewer misclassifications, and vice versa
param_grid = [
    {'C': [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 30, 50, 80, 100],
     'gamma': ['scale', 0.5, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
     'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
]

optimal_params = GridSearchCV(SVC(),
                              param_grid,
                              cv=5,  # 5-fold cross validation
                              scoring='accuracy',  # try other scoring metrics if time permits
                              verbose=0,
                              n_jobs=-1)

optimal_params.fit(X_trainval_scaled, y_trainval)
print(optimal_params.best_params_)

C = optimal_params.best_params_['C']
gamma = optimal_params.best_params_['gamma']
kernel = optimal_params.best_params_['kernel']

clf_svm = SVC(random_state=42, C=C, gamma=gamma, kernel=kernel)
clf_svm.fit(X_trainval_scaled, y_trainval)

# confusion matrix of the tuned SVM on the held-out test split
ConfusionMatrixDisplay.from_estimator(clf_svm,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = clf_svm.score(X_trainval_scaled, y_trainval)
test_score = clf_svm.score(X_test_scaled, y_test)
y_predict = clf_svm.predict(X_test_scaled)

test_recall = recall_score(y_test, y_predict)
svm_fpr, svm_tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(svm_fpr, svm_tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
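"""To see how close the runner-up parameter combinations were, the fitted
GridSearchCV object exposes `cv_results_`. A small inspection sketch (not part
of the original notebook):"""

cv_results = pd.DataFrame(optimal_params.cv_results_)
# best-ranked combinations first, with their mean and spread across the 5 folds
cv_results.sort_values('rank_test_score')[
    ['param_C', 'param_gamma', 'param_kernel', 'mean_test_score', 'std_test_score']
].head()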
"""# Random Forest"""

from sklearn.ensemble import RandomForestClassifier

# n_estimators (M) --> the number of trees in the forest
# max_features (d) --> the number of features to consider when looking for the best split
# max_depth (m) --> the maximum depth of each tree

rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_trainval_scaled, y_trainval)

# confusion matrix of the baseline forest on the held-out test split
ConfusionMatrixDisplay.from_estimator(rfc,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = rfc.score(X_trainval_scaled, y_trainval)
test_score = rfc.score(X_test_scaled, y_test)
y_predict = rfc.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
fpr, tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(fpr, tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)

"""### Optimize parameters (fine-tuning) --> GridSearchCV()"""

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=10)]

# Number of features to consider at every split
# ('auto' was removed in scikit-learn 1.3, so only 'sqrt' and 'log2' are searched)
max_features = ['sqrt', 'log2']

# Maximum number of levels in each tree
max_depth = range(1, 10)

# Measure of the quality of a split
criterion = ['gini']

# Whether bootstrap samples are used when building trees
bootstrap = [True, False]

# Create the param grid
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'criterion': criterion,
              'bootstrap': bootstrap}

optimal_params = GridSearchCV(RandomForestClassifier(),
                              param_grid,
                              cv=5,  # 5-fold cross validation
                              scoring='accuracy',  # try other scoring metrics if time permits
                              verbose=0,
                              n_jobs=-1)

optimal_params.fit(X_trainval_scaled, y_trainval)
print(optimal_params.best_params_)

bootstrap = optimal_params.best_params_['bootstrap']
criterion = optimal_params.best_params_['criterion']
max_depth = optimal_params.best_params_['max_depth']
max_features = optimal_params.best_params_['max_features']
n_estimators = optimal_params.best_params_['n_estimators']

rfc = RandomForestClassifier(n_estimators=n_estimators,
                             max_features=max_features,
                             max_depth=max_depth,
                             criterion=criterion,
                             bootstrap=bootstrap,
                             random_state=42)

rfc.fit(X_trainval_scaled, y_trainval)

# confusion matrix of the tuned forest on the held-out test split
ConfusionMatrixDisplay.from_estimator(rfc,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = rfc.score(X_trainval_scaled, y_trainval)
test_score = rfc.score(X_test_scaled, y_test)
y_predict = rfc.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
rfc_fpr, rfc_tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(rfc_fpr, rfc_tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
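"""Random forests also expose impurity-based feature importances; a quick sketch
(not part of the original notebook) showing which of the eight predictors the
tuned forest relies on most:"""

importances = pd.Series(rfc.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances)
importances.plot(kind='bar', figsize=(8, 4), title='Random Forest feature importances')
plt.show()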
"""# Logistic Regression"""

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

log_reg_model = LogisticRegression().fit(X_trainval_scaled, y_trainval)

# confusion matrix of the baseline model on the held-out test split
ConfusionMatrixDisplay.from_estimator(log_reg_model,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = log_reg_model.score(X_trainval_scaled, y_trainval)
test_score = log_reg_model.score(X_test_scaled, y_test)
y_predict = log_reg_model.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
fpr, tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(fpr, tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)

"""### Optimize parameters (fine-tuning) --> GridSearchCV()"""

# 'liblinear' supports both the 'l1' and 'l2' penalties; the default 'lbfgs'
# solver would fail on the 'l1' entries of this grid
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 2, 3, 5, 10, 100, 1000]}

optimal_params = GridSearchCV(LogisticRegression(solver='liblinear'),
                              param_grid,
                              cv=5,  # 5-fold cross validation
                              scoring='accuracy',  # try other scoring metrics if time permits
                              verbose=0,
                              n_jobs=-1)

optimal_params.fit(X_trainval_scaled, y_trainval)
print(optimal_params.best_params_)

# best_score = -10
# for c in range(1, 20):
#     log_reg_model = LogisticRegression(C=c)
#     scores = cross_val_score(log_reg_model, X_trainval_scaled, y_trainval, cv=5, scoring='accuracy')
#
#     mean_score = scores.mean()
#
#     if mean_score > best_score:
#         best_score = mean_score
#         best_c = c
# print(best_c)

best_C = optimal_params.best_params_['C']
best_penalty = optimal_params.best_params_['penalty']

best_log_reg_model = LogisticRegression(C=best_C, penalty=best_penalty, solver='liblinear').fit(X_trainval_scaled, y_trainval)

# confusion matrix of the tuned model on the held-out test split
ConfusionMatrixDisplay.from_estimator(best_log_reg_model,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = best_log_reg_model.score(X_trainval_scaled, y_trainval)
test_score = best_log_reg_model.score(X_test_scaled, y_test)
y_predict = best_log_reg_model.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
lgr_fpr, lgr_tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(lgr_fpr, lgr_tpr)

print("Train accuracy with logistic regression:", train_score)
print("Test accuracy with logistic regression:", test_score)
print("Test recall with logistic regression:", test_recall)
print("Test AUC with logistic regression:", test_auc)
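"""Because the features were standardized, the magnitudes of the logistic
regression coefficients are roughly comparable across predictors. A sketch (not
part of the original notebook) to inspect them:"""

# positive coefficients push the prediction towards 'Demented' (class 1)
coefs = pd.Series(best_log_reg_model.coef_[0], index=X.columns).sort_values()
print(coefs)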
"""# Decision Tree"""

dt_model = DecisionTreeClassifier().fit(X_trainval_scaled, y_trainval)

# confusion matrix of the baseline tree on the held-out test split
ConfusionMatrixDisplay.from_estimator(dt_model,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = dt_model.score(X_trainval_scaled, y_trainval)
test_score = dt_model.score(X_test_scaled, y_test)
y_predict = dt_model.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
fpr, tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(fpr, tpr)

print("Train accuracy with DecisionTreeClassifier:", train_score)
print("Test accuracy with DecisionTreeClassifier:", test_score)
print("Test recall with DecisionTreeClassifier:", test_recall)
print("Test AUC with DecisionTreeClassifier:", test_auc)

"""### Optimize parameters (fine-tuning) --> GridSearchCV()"""

param_grid = {'criterion': ['gini'],
              'max_depth': range(1, 10)}

optimal_params = GridSearchCV(DecisionTreeClassifier(),
                              param_grid,
                              cv=5,  # 5-fold cross validation
                              scoring='accuracy',  # try other scoring metrics if time permits
                              verbose=0,
                              n_jobs=-1)

optimal_params.fit(X_trainval_scaled, y_trainval)
print(optimal_params.best_params_)

criterion = optimal_params.best_params_['criterion']
max_depth = optimal_params.best_params_['max_depth']

# best_score = -1
# for d in range(1, 25):
#     dt_model = DecisionTreeClassifier(max_depth=d)
#     scores = cross_val_score(dt_model, X_trainval_scaled, y_trainval, cv=5, scoring='accuracy')
#
#     mean_score = scores.mean()
#
#     if mean_score > best_score:
#         best_score = mean_score
#         best_d = d
# print(best_d)

dt_model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth).fit(X_trainval_scaled, y_trainval)

# confusion matrix of the tuned tree on the held-out test split
ConfusionMatrixDisplay.from_estimator(dt_model,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = dt_model.score(X_trainval_scaled, y_trainval)
test_score = dt_model.score(X_test_scaled, y_test)
y_predict = dt_model.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
dt_fpr, dt_tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(dt_fpr, dt_tpr)

print("Train accuracy with DecisionTreeClassifier:", train_score)
print("Test accuracy with DecisionTreeClassifier:", test_score)
print("Test recall with DecisionTreeClassifier:", test_recall)
print("Test AUC with DecisionTreeClassifier:", test_auc)

"""## Plot ROC and compare AUC"""

plt.figure(figsize=(5, 5), dpi=100)
plt.plot(svm_fpr, svm_tpr, linestyle='-', label='SVM')
plt.plot(lgr_fpr, lgr_tpr, marker='.', label='Logistic')
plt.plot(rfc_fpr, rfc_tpr, linestyle=':', label='Random Forest')
plt.plot(dt_fpr, dt_tpr, linestyle='-.', label='Decision Tree')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.legend()

plt.show()
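"""## Cross-Validated Comparison (Sketch)

A final sanity check, not part of the original notebook, using the
already-imported `cross_val_score`: single held-out test scores on a dataset
this small can be noisy, so it is worth comparing 5-fold cross-validated
accuracy of the four tuned models on the training split as well:"""

for name, model in [('SVM', clf_svm),
                    ('Random Forest', rfc),
                    ('Logistic Regression', best_log_reg_model),
                    ('Decision Tree', dt_model)]:
    scores = cross_val_score(model, X_trainval_scaled, y_trainval, cv=5, scoring='accuracy')
    print(f"{name}: mean CV accuracy {scores.mean():.3f} (+/- {scores.std():.3f})")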