From 18c577a571dc159392575335df0d42983542aa8f Mon Sep 17 00:00:00 2001
From: Simran Shaikh
Date: Fri, 8 Nov 2024 22:49:01 +0530
Subject: [PATCH] added .py file #208

---
 ...tia_prediction_using_different_ml_model.py | 572 ++++++++++++++++++
 1 file changed, 572 insertions(+)
 create mode 100644 pages/dementia_prediction_using_different_ml_model.py

diff --git a/pages/dementia_prediction_using_different_ml_model.py b/pages/dementia_prediction_using_different_ml_model.py
new file mode 100644
index 00000000..83f56674
--- /dev/null
+++ b/pages/dementia_prediction_using_different_ml_model.py
@@ -0,0 +1,572 @@
# -*- coding: utf-8 -*-
"""dementia-prediction-using-different-ml-model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1oTgATmP3qQHo6gQkv680M9klmx95_zqO

## Importing Libs
"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd  # used to load and manipulate the data and for one-hot encoding
import numpy as np  # data manipulation
# %matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from sklearn.utils import resample  # for downsampling the dataset
from sklearn.model_selection import train_test_split  # for splitting the dataset into train and test sets
from sklearn.preprocessing import scale  # scale and center the data
from sklearn.svm import SVC  # will make an SVM for classification
from sklearn.model_selection import GridSearchCV  # will do the cross validation
from sklearn.metrics import ConfusionMatrixDisplay  # will draw the confusion matrix (plot_confusion_matrix was removed in scikit-learn 1.2)
from sklearn.decomposition import PCA  # to perform PCA to plot the data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, roc_curve, auc
import seaborn as sns

"""## Load the data"""

data = pd.read_csv("../input/mri-and-alzheimers/oasis_longitudinal.csv")

"""## Explore the data"""

pd.set_option('display.max_columns', None)  # show all columns of a pandas dataframe
pd.set_option('display.max_rows', None)  # show all rows of a pandas dataframe

data.head()
# data.tail()
# data.size

data.shape

data.info()

"""## Converting Categorical Data to Numerical Data

When **inplace=True**, the data is modified in place: the call returns nothing and the dataframe itself is updated.
When **inplace=False** (the *default*), the operation returns a modified copy of the object, which you then need to assign to a variable.

Set axis=0 (or axis='rows') to operate along rows.

Set axis=1 (or axis='columns') to operate along columns.
"""

data['M/F'] = [1 if each == "M" else 0 for each in data['M/F']]
data['Group'] = [1 if each == "Demented" or each == "Converted" else 0 for each in data['Group']]
# data['Group'] = data['Group'].replace(['Converted'], ['Demented']) # Target variable
# data['Group'] = data['Group'].replace(['Demented', 'Nondemented'], [1,0]) # Target variable
data.info()
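"""A hypothetical alternative (kept commented out, since the columns above have
already been converted): an explicit dictionary passed to `Series.map` makes the
same encoding self-documenting, assuming the raw labels are exactly "M"/"F" and
"Nondemented"/"Demented"/"Converted". Any unexpected label would surface as NaN
instead of being silently encoded as 0."""

# sex_map = {"M": 1, "F": 0}
# group_map = {"Demented": 1, "Converted": 1, "Nondemented": 0}
# data['M/F'] = data['M/F'].map(sex_map)
# data['Group'] = data['Group'].map(group_map)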
"""Note: based on the given data, **CDR** (Clinical Dementia Rating) indicates the patient's condition, i.e. whether or not the patient has dementia.

CDR Value Meaning:

* 0 ---> Normal
* 0.5 ---> Very Mild Dementia
* 1 ---> Mild Dementia
* 2 ---> Moderate Dementia
* 3 ---> Severe Dementia

## Correlation Between Attributes
"""

# numeric_only=True keeps the remaining string columns out of the correlation
# (required on pandas >= 2.0, where corr() no longer drops them silently)
correlation_matrix = data.corr(numeric_only=True)
data_corr = correlation_matrix['Group'].sort_values(ascending=False)
data_corr

from pandas.plotting import scatter_matrix

attributes = ["Group", "CDR", "M/F", "SES", "ASF"]

scatter_matrix(data[attributes], figsize=(15, 11), alpha=0.3)

import plotly.express as px

fig = px.scatter(data, x='Group', y='SES', color='Group')
fig.show()

fig = px.scatter(data, x='Group', y='Age', color='Group')
fig.show()

fig = px.scatter(data, x='Group', y='ASF', color='Group')
fig.show()

"""## Checking For Missing/Null Values"""

data.isnull().sum()

"""### Taking the median value for the missing values of MMSE"""

median = data['MMSE'].median()
data['MMSE'] = data['MMSE'].fillna(median)  # assignment instead of inplace=True avoids pandas chained-assignment warnings
data.isnull().sum()

"""### Taking the median value for the missing values of SES"""

median = data['SES'].median()
data['SES'] = data['SES'].fillna(median)
data.isnull().sum()

"""## Train-Test Split

## Prepare the data for X and y where,

1. X = the columns/features used for **making the prediction**
2. y = the **value to be predicted**
"""

y = data['Group'].values
X = data[['M/F', 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']]

"""### Train-Test Distribution Without Stratified Sampling"""

# by default test_size=0.25
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

df_ytrain = pd.DataFrame(y_trainval)
df_ytest = pd.DataFrame(y_test)

print('In Training Split:')
print(df_ytrain[0].value_counts())

print('\nIn Testing Split:')
print(df_ytest[0].value_counts())

"""### With Stratified Sampling"""

# by default test_size=0.25
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

df_ytrain = pd.DataFrame(y_trainval)
df_ytest = pd.DataFrame(y_test)

print('In Training Split:')
print(df_ytrain[0].value_counts())

print('\nIn Testing Split:')
print(df_ytest[0].value_counts())

"""### Scale the dataset"""

# StandardScaler() computes z = (x - u) / s
scaler = StandardScaler().fit(X_trainval)
# scaler = MinMaxScaler().fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_test_scaled = scaler.transform(X_test)

X_trainval_scaled

X_trainval.describe()

"""## Data Visualization"""

X_trainval.hist(bins=30, figsize=(20, 15))
plt.show()

fig = px.histogram(X_trainval, x='eTIV', nbins=50)
fig.show()

fig = px.scatter(X_trainval, x='eTIV')
fig.show()

"""# SVM"""

clf_svm = SVC(random_state=42)
clf_svm.fit(X_trainval_scaled, y_trainval)

# confusion matrix on the held-out test split
ConfusionMatrixDisplay.from_estimator(clf_svm,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = clf_svm.score(X_trainval_scaled, y_trainval)
test_score = clf_svm.score(X_test_scaled, y_test)
y_predict = clf_svm.predict(X_test_scaled)

test_recall = recall_score(y_test, y_predict)
fpr, tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(fpr, tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
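"""`roc_curve` on hard 0/1 predictions yields only a single operating point, so
the AUC above is coarse. A sketch (not part of the original notebook) that uses
the SVM's continuous `decision_function` scores instead gives a proper
threshold-based curve:"""

# signed distance of each test sample from the separating hyperplane
y_scores = clf_svm.decision_function(X_test_scaled)
fpr_s, tpr_s, thresholds_s = roc_curve(y_test, y_scores)
print("Test AUC from decision scores", auc(fpr_s, tpr_s))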
"""### Optimize parameters (fine-tuning) --> GridSearchCV() for SVM"""

# C = 1 and gamma = 'scale' are the defaults
# C controls how wide the margin is with respect to how many misclassifications we allow:
# increasing C shrinks the margin and permits fewer misclassifications, and vice versa
param_grid = [
    {'C': [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 30, 50, 80, 100],
     'gamma': ['scale', 0.5, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
     'kernel': ['rbf', 'linear', 'poly', 'sigmoid']},
]

optimal_params = GridSearchCV(SVC(),
                              param_grid,
                              cv=5,  # 5-fold cross validation
                              scoring='accuracy',  # try other scoring metrics if time permits
                              verbose=0,
                              n_jobs=-1)

optimal_params.fit(X_trainval_scaled, y_trainval)
print(optimal_params.best_params_)

C = optimal_params.best_params_['C']
gamma = optimal_params.best_params_['gamma']
kernel = optimal_params.best_params_['kernel']

clf_svm = SVC(random_state=42, C=C, gamma=gamma, kernel=kernel)
clf_svm.fit(X_trainval_scaled, y_trainval)

# confusion matrix of the tuned SVM on the held-out test split
ConfusionMatrixDisplay.from_estimator(clf_svm,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = clf_svm.score(X_trainval_scaled, y_trainval)
test_score = clf_svm.score(X_test_scaled, y_test)
y_predict = clf_svm.predict(X_test_scaled)

test_recall = recall_score(y_test, y_predict)
svm_fpr, svm_tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(svm_fpr, svm_tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
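"""To see how close the runner-up parameter combinations were, the fitted
GridSearchCV object exposes `cv_results_`. A small inspection sketch (not part
of the original notebook):"""

cv_results = pd.DataFrame(optimal_params.cv_results_)
# best-ranked combinations first, with their mean and spread across the 5 folds
cv_results.sort_values('rank_test_score')[
    ['param_C', 'param_gamma', 'param_kernel', 'mean_test_score', 'std_test_score']
].head()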
"""# Random Forest"""

from sklearn.ensemble import RandomForestClassifier

# n_estimators (M) --> the number of trees in the forest
# max_features (d) --> the number of features to consider when looking for the best split
# max_depth (m) --> the maximum depth of each tree

rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_trainval_scaled, y_trainval)

# confusion matrix of the baseline forest on the held-out test split
ConfusionMatrixDisplay.from_estimator(rfc,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = rfc.score(X_trainval_scaled, y_trainval)
test_score = rfc.score(X_test_scaled, y_test)
y_predict = rfc.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
fpr, tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(fpr, tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)

"""### Optimize parameters (fine-tuning) --> GridSearchCV()"""

# Number of trees in the random forest
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=10)]

# Number of features to consider at every split
# ('auto' was removed in scikit-learn 1.3, so only 'sqrt' and 'log2' are searched)
max_features = ['sqrt', 'log2']

# Maximum number of levels in each tree
max_depth = range(1, 10)

# Measure of the quality of a split
criterion = ['gini']

# Whether bootstrap samples are used when building trees
bootstrap = [True, False]

# Create the param grid
param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'criterion': criterion,
              'bootstrap': bootstrap}

optimal_params = GridSearchCV(RandomForestClassifier(),
                              param_grid,
                              cv=5,  # 5-fold cross validation
                              scoring='accuracy',  # try other scoring metrics if time permits
                              verbose=0,
                              n_jobs=-1)

optimal_params.fit(X_trainval_scaled, y_trainval)
print(optimal_params.best_params_)

bootstrap = optimal_params.best_params_['bootstrap']
criterion = optimal_params.best_params_['criterion']
max_depth = optimal_params.best_params_['max_depth']
max_features = optimal_params.best_params_['max_features']
n_estimators = optimal_params.best_params_['n_estimators']

rfc = RandomForestClassifier(n_estimators=n_estimators,
                             max_features=max_features,
                             max_depth=max_depth,
                             criterion=criterion,
                             bootstrap=bootstrap,
                             random_state=42)

rfc.fit(X_trainval_scaled, y_trainval)

# confusion matrix of the tuned forest on the held-out test split
ConfusionMatrixDisplay.from_estimator(rfc,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = rfc.score(X_trainval_scaled, y_trainval)
test_score = rfc.score(X_test_scaled, y_test)
y_predict = rfc.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
rfc_fpr, rfc_tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(rfc_fpr, rfc_tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)
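"""Random forests also expose impurity-based feature importances; a quick sketch
(not part of the original notebook) showing which of the eight predictors the
tuned forest relies on most:"""

importances = pd.Series(rfc.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances)
importances.plot(kind='bar', figsize=(8, 4), title='Random Forest feature importances')
plt.show()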
"""# Logistic Regression"""

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

log_reg_model = LogisticRegression().fit(X_trainval_scaled, y_trainval)

# confusion matrix of the baseline model on the held-out test split
ConfusionMatrixDisplay.from_estimator(log_reg_model,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = log_reg_model.score(X_trainval_scaled, y_trainval)
test_score = log_reg_model.score(X_test_scaled, y_test)
y_predict = log_reg_model.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
fpr, tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(fpr, tpr)

print("Train accuracy ", train_score)
print("Test accuracy ", test_score)
print("Test recall", test_recall)
print("Test AUC", test_auc)

"""### Optimize parameters (fine-tuning) --> GridSearchCV()"""

# 'liblinear' supports both the 'l1' and 'l2' penalties; the default 'lbfgs'
# solver would fail on the 'l1' entries of this grid
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 2, 3, 5, 10, 100, 1000]}

optimal_params = GridSearchCV(LogisticRegression(solver='liblinear'),
                              param_grid,
                              cv=5,  # 5-fold cross validation
                              scoring='accuracy',  # try other scoring metrics if time permits
                              verbose=0,
                              n_jobs=-1)

optimal_params.fit(X_trainval_scaled, y_trainval)
print(optimal_params.best_params_)

# best_score = -10
# for c in range(1, 20):
#     log_reg_model = LogisticRegression(C=c)
#     scores = cross_val_score(log_reg_model, X_trainval_scaled, y_trainval, cv=5, scoring='accuracy')
#
#     mean_score = scores.mean()
#
#     if mean_score > best_score:
#         best_score = mean_score
#         best_c = c
# print(best_c)

best_C = optimal_params.best_params_['C']
best_penalty = optimal_params.best_params_['penalty']

best_log_reg_model = LogisticRegression(C=best_C, penalty=best_penalty, solver='liblinear').fit(X_trainval_scaled, y_trainval)

# confusion matrix of the tuned model on the held-out test split
ConfusionMatrixDisplay.from_estimator(best_log_reg_model,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = best_log_reg_model.score(X_trainval_scaled, y_trainval)
test_score = best_log_reg_model.score(X_test_scaled, y_test)
y_predict = best_log_reg_model.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
lgr_fpr, lgr_tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(lgr_fpr, lgr_tpr)

print("Train accuracy with logistic regression:", train_score)
print("Test accuracy with logistic regression:", test_score)
print("Test recall with logistic regression:", test_recall)
print("Test AUC with logistic regression:", test_auc)
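"""Because the features were standardized, the magnitudes of the logistic
regression coefficients are roughly comparable across predictors. A sketch (not
part of the original notebook) to inspect them:"""

# positive coefficients push the prediction towards 'Demented' (class 1)
coefs = pd.Series(best_log_reg_model.coef_[0], index=X.columns).sort_values()
print(coefs)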
"""# Decision Tree"""

dt_model = DecisionTreeClassifier().fit(X_trainval_scaled, y_trainval)

# confusion matrix of the baseline tree on the held-out test split
ConfusionMatrixDisplay.from_estimator(dt_model,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = dt_model.score(X_trainval_scaled, y_trainval)
test_score = dt_model.score(X_test_scaled, y_test)
y_predict = dt_model.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
fpr, tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(fpr, tpr)

print("Train accuracy with DecisionTreeClassifier:", train_score)
print("Test accuracy with DecisionTreeClassifier:", test_score)
print("Test recall with DecisionTreeClassifier:", test_recall)
print("Test AUC with DecisionTreeClassifier:", test_auc)

"""### Optimize parameters (fine-tuning) --> GridSearchCV()"""

param_grid = {'criterion': ['gini'],
              'max_depth': range(1, 10)}

optimal_params = GridSearchCV(DecisionTreeClassifier(),
                              param_grid,
                              cv=5,  # 5-fold cross validation
                              scoring='accuracy',  # try other scoring metrics if time permits
                              verbose=0,
                              n_jobs=-1)

optimal_params.fit(X_trainval_scaled, y_trainval)
print(optimal_params.best_params_)

criterion = optimal_params.best_params_['criterion']
max_depth = optimal_params.best_params_['max_depth']

# best_score = -1
# for d in range(1, 25):
#     dt_model = DecisionTreeClassifier(max_depth=d)
#     scores = cross_val_score(dt_model, X_trainval_scaled, y_trainval, cv=5, scoring='accuracy')
#
#     mean_score = scores.mean()
#
#     if mean_score > best_score:
#         best_score = mean_score
#         best_d = d
# print(best_d)

dt_model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth).fit(X_trainval_scaled, y_trainval)

# confusion matrix of the tuned tree on the held-out test split
ConfusionMatrixDisplay.from_estimator(dt_model,
                                      X_test_scaled,
                                      y_test,
                                      values_format='d',
                                      display_labels=['Nondemented', 'Demented'])

train_score = dt_model.score(X_trainval_scaled, y_trainval)
test_score = dt_model.score(X_test_scaled, y_test)
y_predict = dt_model.predict(X_test_scaled)
test_recall = recall_score(y_test, y_predict)
dt_fpr, dt_tpr, thresholds = roc_curve(y_test, y_predict)
test_auc = auc(dt_fpr, dt_tpr)

print("Train accuracy with DecisionTreeClassifier:", train_score)
print("Test accuracy with DecisionTreeClassifier:", test_score)
print("Test recall with DecisionTreeClassifier:", test_recall)
print("Test AUC with DecisionTreeClassifier:", test_auc)

"""## Plot ROC and compare AUC"""

plt.figure(figsize=(5, 5), dpi=100)
plt.plot(svm_fpr, svm_tpr, linestyle='-', label='SVM')
plt.plot(lgr_fpr, lgr_tpr, marker='.', label='Logistic')
plt.plot(rfc_fpr, rfc_tpr, linestyle=':', label='Random Forest')
plt.plot(dt_fpr, dt_tpr, linestyle='-.', label='Decision Tree')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.legend()

plt.show()
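"""## Cross-Validated Comparison (Sketch)

A final sanity check, not part of the original notebook, using the
already-imported `cross_val_score`: single held-out test scores on a dataset
this small can be noisy, so it is worth comparing 5-fold cross-validated
accuracy of the four tuned models on the training split as well:"""

for name, model in [('SVM', clf_svm),
                    ('Random Forest', rfc),
                    ('Logistic Regression', best_log_reg_model),
                    ('Decision Tree', dt_model)]:
    scores = cross_val_score(model, X_trainval_scaled, y_trainval, cv=5, scoring='accuracy')
    print(f"{name}: mean CV accuracy {scores.mean():.3f} (+/- {scores.std():.3f})")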