-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathPCA.py
85 lines (63 loc) · 2.86 KB
/
PCA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# source https://towardsdatascience.com/visualising-the-classification-power-of-data-54f5273f640
# modified version by A. ANDREEV
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn.metrics as metric
import statsmodels.api as sm
cancer = load_breast_cancer()
df = pd.DataFrame(cancer['data'],columns=cancer['feature_names']) #len(cancer['feature_names']) = 30
df['y'] = cancer['target']
# 31 feaures/columns in 569 rows
print("Number of features: ", len(df.columns)) # y is also counted as feature!
print("Number of samples: ", df.shape[0])
#Scale the data (some features will tend to have higher variances because of their scale)
scaler = StandardScaler()
scaler.fit(df)
scaled = scaler.transform(df)
#Obtain principal components
pca = PCA().fit(scaled)
#Apply dimensionality reduction to scaled. scaled is projected on the first principal components previously extracted from a training set.
pc = pca.transform(scaled)
#so in the new features (that are not the same as the original ones)
#usually the first 2-3 contain most of the variance
pc1 = pc[:,0]
pc2 = pc[:,1]
#Plot principal components
plt.figure(figsize=(10,10))
colour = ['#ff2121' if y == 1 else '#2176ff' for y in df['y']]
plt.scatter(pc1,pc2 ,c=colour,edgecolors='#000000')
plt.ylabel("Glucose",size=20)
plt.xlabel('Age',size=20)
plt.yticks(size=12)
plt.xticks(size=12)
plt.xlabel('PC1')
plt.ylabel('PC2')
# scree plot of the principal componenets
var = pca.explained_variance_[0:10] #percentage of variance explained
labels = ['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10']
plt.figure(figsize=(15,7))
plt.bar(labels,var,) #first param is horizontal
plt.xlabel('Pricipal Component')
plt.ylabel('Proportion of Variance Explained')
#using groups, testing which group of features would be more useful for making predictions
group_1 = ['mean symmetry', 'symmetry error','worst symmetry',
'mean smoothness','smoothness error','worst smoothness']
group_2 = ['mean perimeter','perimeter error','worst perimeter',
'mean concavity','concavity error','worst concavity']
#group 2 should have better classfication based on PCA scatter plot
groups = [group_1, group_2]
for i,g in enumerate(groups): #index, group
x = df[g]
x = sm.add_constant(x) #add a column (as first column) with ones
y = df['y']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,
random_state = 101)
model = sm.Logit(y_train,x_train).fit() #fit logistic regression model
predictions = np.around(model.predict(x_test))
accuracy = metric.accuracy_score(y_test,predictions)
print("Accuracy of Group {}: {}".format(i+1,accuracy))