"""
ClassProperties.py
Purpose:
Get mean and stdev properties for classes, and
give new label indices by SST
Input:
address = root location
n_comp = number of classes/components in GMM
Output:
- data frame with everything, including SST sorted labels
(saved as pickle file)
- old2new: new class indices (sorted by SST)
"""
# import relevant modules
import csv
import glob
import pickle

import numpy as np
import pandas as pd

import Print

def main(address, runIndex, n_comp):
    print("ClassProperties.main()")
    # set paths
    floc = address + 'Data_store/CentredAndUncentred/'
    labloc_unsorted = address + 'Data_store/Labels/Labels_unsorted.csv'
    frame_store = address + 'Objects/AllProfiles.pkl'
    # find all csv files; sort so the depth-level order is deterministic
    # (glob alone returns files in arbitrary order)
    allFiles = sorted(glob.glob(floc + "*.csv"))
    # read in the unsorted label data
    df0 = pd.read_csv(labloc_unsorted, index_col=None, header=0)
    labels = df0['label'].values
    # read depth levels
    depths_retained = Print.readDepth(address, runIndex)
    # load posterior probabilities for each class
    class_number_array = np.arange(n_comp).reshape(-1, 1)
    lon_pp, lat_pp, dynHeight_pp, varTime_pp, post_prob = \
        Print.readPosteriorProb(address, runIndex, class_number_array)
    # read in T,S data for each depth level and stack with the label data
    list_ = []
    for file_ in allFiles:
        df = pd.read_csv(file_, index_col=None, header=0)
        list_.append(np.column_stack((df.values, labels)))
    # stack depths as new dimension / shape is (profile number, variable, depth)
    # where variables are lon lat dynHeight Tint Tint_centred Sint Time Label
    allOne = np.dstack(list_)
    numberOfProfiles, numberOfVars, numberOfDepths = allOne.shape
    # make a pandas data frame that can be easily split
    print('ClassProperties.main(): creating data frame (this may take a while)')
    d = []
    for i in range(numberOfProfiles):
        for k in range(numberOfDepths):
            d.append({'profile_index': i,
                      'depth_index': k,
                      'longitude': allOne[i, 0, k],
                      'latitude': allOne[i, 1, k],
                      'pressure': depths_retained[k],
                      'dynamic_height': allOne[i, 2, k],
                      'temperature': allOne[i, 3, k],
                      'temperature_standardized': allOne[i, 4, k],
                      'salinity': allOne[i, 5, k],
                      'time': allOne[i, 6, k],
                      'class': int(allOne[i, 7, k]),
                      'posterior_probability': np.max(post_prob[i, :])})
    allDF = pd.DataFrame(d)
    # clear some memory by getting rid of variables
    del allOne
    del post_prob
    # read the pickle file data frame (for testing only)
    # allDF = pd.read_pickle(frame_store, compression='infer')
    # surface only (15 dbar is assumed to be the shallowest retained level)
    surfaceDF = allDF[allDF.pressure == 15]
    surfaceDFg = surfaceDF.groupby(['class'])
    # class temperature statistics
    T_means = allDF.groupby(['class'])['temperature'].mean()
    SST_means = surfaceDFg['temperature'].mean()
    SST_medians = surfaceDFg['temperature'].median()
    # sort classes by depth-averaged mean temperature, coldest first
    # (note: the surface-only statistics above are not used for sorting)
    old2new = np.argsort(T_means.values)
    # construct dictionary to replace old class numbers with new ones
    di = dict(zip(old2new, range(n_comp)))
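    # Worked example of the relabelling (illustrative values only): with
    # three classes and T_means = [5.2, 1.1, 3.3], np.argsort gives
    # old2new = [1, 2, 0], so di = {1: 0, 2: 1, 0: 2}, i.e. old class 1
    # (coldest) becomes new class 0 and old class 0 (warmest) becomes
    # new class 2.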
    # save dictionary to csv for later use (write the old class numbers as
    # the header row; without it the data row would just read 0..n_comp-1)
    with open(address + 'Results/old2new.csv', 'w', newline='') as csvfile:
        w = csv.DictWriter(csvfile, fieldnames=di.keys())
        w.writeheader()
        w.writerow(di)
    # write to pickle file for later use
    with open(address + 'Results/old2new.pkl', 'wb') as f:
        pickle.dump(di, f)
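    # To reuse the mapping later (a minimal sketch, assuming the same
    # `address` root):
    #     with open(address + 'Results/old2new.pkl', 'rb') as f:
    #         old2new_map = pickle.load(f)
    #     new_label = old2new_map[old_label]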
    # add sorted class numbers as a new column
    allDF['class_sorted'] = allDF['class'].map(di)
    # save allDF pickle object for later use
    print('ClassProperties.main(): pickling data frame')
    allDF.to_pickle(frame_store, compression='infer')
    # write per-class summary statistics (count, mean, std, quartiles) to csv
    print('ClassProperties.main(): writing summaries')
    allDFgrouped = allDF.groupby('class_sorted')
    for column in allDF.columns:
        fname = address + 'Results/' + column + '_stats.csv'
        allDFgrouped[column].describe().to_csv(fname)
#######################################################################
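
# Minimal usage sketch (hypothetical values; main() assumes `address`
# points at a run directory with the layout described at the top):
if __name__ == '__main__':
    main(address='/path/to/run/', runIndex=0, n_comp=8)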