import math
import statistics
import warnings
import numpy as np
from hmmlearn.hmm import GaussianHMM
from sklearn.model_selection import KFold
from asl_utils import combine_sequences
import logging

class ModelSelector(object):
    '''
    base class for model selection (strategy design pattern)
    '''

    def __init__(self,
                 all_word_sequences: dict,
                 all_word_Xlengths: dict,
                 this_word: str,
                 n_constant=3,
                 min_n_components=2,
                 max_n_components=10,
                 random_state=14, verbose=False):
        self.words = all_word_sequences
        self.hwords = all_word_Xlengths
        self.sequences = all_word_sequences[this_word]
        self.X, self.lengths = all_word_Xlengths[this_word]
        self.this_word = this_word
        self.n_constant = n_constant
        self.min_n_components = min_n_components
        self.max_n_components = max_n_components
        self.random_state = random_state
        self.verbose = verbose

    def select(self):
        raise NotImplementedError

    def base_model(self, num_states):
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        try:
            # Train a Gaussian HMM with `num_states` hidden states on this word's data
            hmm_model = GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000,
                                    random_state=self.random_state, verbose=False).fit(self.X, self.lengths)
            if self.verbose:
                print("model created for {} with {} states".format(self.this_word, num_states))
            return hmm_model
        except Exception:
            if self.verbose:
                print("failure on {} with {} states".format(self.this_word, num_states))
            return None

class SelectorConstant(ModelSelector):
    """ select the model with value self.n_constant
    """

    def select(self):
        """ select based on n_constant value
        :return: GaussianHMM object
        """
        best_num_components = self.n_constant
        return self.base_model(best_num_components)

class SelectorBIC(ModelSelector):
    """
    Abbreviations:
    - BIC - Bayesian Information Criterion
    - CV - Cross-Validation

    About BIC:
    - Maximises the likelihood of the data whilst penalising large models
    - Used to score model topologies by balancing fit and complexity
      within the training set for each word
    - Avoids CV by using a penalty term instead

    BIC Equation: BIC = -2 * log L + p * log N
    (a rearrangement of Equation (12) in Reference [0])
    - where "L" is the likelihood of the fitted model
    - where "p" is the number of free parameters in the model
      (aka model "complexity"), References [2][3]
    - where "p * log N" is the penalty term (it grows with "p",
      penalising complexity and discouraging overfitting)
    - where "N" is the number of data points (size of the data set)

    Notes:
    - -2 * log L  -> decreases as "p" increases (a larger model fits the training data better)
    - p * log N   -> increases as "p" increases
    - once N > e^2 ≈ 7.4, log N > 2, so BIC penalises each extra parameter
      more heavily than AIC's constant factor of 2

    Selection using BIC:
    - The lower the BIC score, the "better" the model.
    - SelectorBIC is constructed like the ModelSelector base class,
      with attributes such as this_word, min_n_components, max_n_components
    - Loop from min_n_components to max_n_components
    - Pick the model with the lowest BIC score

    References:
    [0] - http://www2.imm.dtu.dk/courses/02433/doc/ch6_slides.pdf
    [1] - https://en.wikipedia.org/wiki/Hidden_Markov_model#Architecture
    [2] - https://stats.stackexchange.com/questions/12341/number-of-parameters-in-markov-model
    [3] - https://discussions.udacity.com/t/number-of-parameters-bic-calculation/233235/8
    [4] - http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.58.6208&rep=rep1&type=pdf
    """
    # High model complexity (i.e. many paths through the states) is penalised by BIC.
    # Complexity is measured as the number of parameters yet to be estimated
    # (aka free parameters) when fitting the model to a specific data set.
    #
    # p = num_free_params
    #   = init_state_occupation_probs + transition_probs + emission_probs
    #   = ( num_states - 1 ) +
    #     ( num_states * (num_states - 1) ) +
    #     ( num_states * num_features * 2 )
    #
    # Simplifying becomes:
    #   = ( num_states ** 2 ) + ( 2 * num_states * num_features ) - 1
    #
    # where num_states = possible states a hidden variable at time t may be in
    # where init_state_occupation_probs = num_states - 1
    #       (the initial probabilities sum to 1, so one is determined by the rest)
    # where transition_probs = num_states * (num_states - 1)
    #       (each row of the transition matrix sums to 1)
    #       Reference: https://en.wikipedia.org/wiki/Hidden_Markov_model#Architecture
    # where emission_probs (aka output_probs) = num_means + num_covars
    #       = num_states * num_features * 2
    #       (one mean and one diagonal covariance per state per feature)
    # where num_features = number of features per frame, i.e. len(self.X[0])
    #
    # References:
    # - https://discussions.udacity.com/t/understanding-better-model-selection/232987/7
    # - https://discussions.udacity.com/t/number-of-parameters-bic-calculation/233235/12
    # - https://stats.stackexchange.com/questions/12341/number-of-parameters-in-markov-model
    # - https://hmmlearn.readthedocs.io/en/latest/tutorial.html
    def calc_num_free_params(self, num_states, num_features):
        return (num_states ** 2) + (2 * num_states * num_features) - 1

    def calc_score_bic(self, log_likelihood, num_free_params, num_data_points):
        return (-2 * log_likelihood) + (num_free_params * np.log(num_data_points))

    def calc_best_score_bic(self, score_bics):
        # Min of list of (score, model) tuples, comparing by the score at index 0
        return min(score_bics, key=lambda x: x[0])
    def select(self):
        """ Select best model for self.this_word based on BIC score
        for n between self.min_n_components and self.max_n_components
        :return: GaussianHMM object
        """
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        score_bics = []
        for num_states in range(self.min_n_components, self.max_n_components + 1):
            try:
                hmm_model = self.base_model(num_states)
                log_likelihood = hmm_model.score(self.X, self.lengths)
                num_data_points = sum(self.lengths)  # N: total number of frames
                num_features = len(self.X[0])        # f: features per frame
                num_free_params = self.calc_num_free_params(num_states, num_features)
                score_bic = self.calc_score_bic(log_likelihood, num_free_params, num_data_points)
                score_bics.append((score_bic, hmm_model))
            except Exception:
                pass
        return self.calc_best_score_bic(score_bics)[1] if score_bics else None

class SelectorDIC(ModelSelector):
    """
    Abbreviations:
    - DIC - Discriminative Information Criterion

    About DIC:
    - With DIC we look for the number of components where the difference is
      largest. The idea is to find the model that gives a high likelihood
      (a small negative number) to the original word and a low likelihood
      (a very large negative number) to the other words
    - To get an optimal model for any word, the model must be scored on all
      the other words so that the formula below can be evaluated
    - DIC scores a model topology on its ability to discriminate the training
      word from competing words. It applies a "penalty" when the model
      likelihoods for non-matching words are too similar to the model
      likelihood for the correct word (rather than penalising complexity
      as BIC does)
    - A task-oriented model selection criterion, it adapts well to
      classification problems
    - The classification task accounts for the goal of the model (unlike BIC)

    DIC Equation:
        DIC = log(P(X(i))) - 1/(M - 1) * sum(log(P(X(j))) for all j != i)
        (Equation (17) in Reference [0]; assumes all data sets are the same size)
            = log likelihood of the data belonging to the model
              - average of the anti-log-likelihoods of the competing words
            = log(P(original word)) - average(log(P(other words)))
    - where "anti-log-likelihood" means the likelihood of data X under model M
      when X belongs to a competing category
    - where log(P(X(i))) is the log-likelihood of the fitted model for the
      current word (in hmmlearn terms, the model's score for the current word)
    - where X is the input training data, given as a word dictionary
    - where X(i) is the current word being evaluated
    - where M is the number of words (categories) being compared

    Selection using DIC:
    - The higher the DIC score, the "better" the model.
    - SelectorDIC is constructed like the ModelSelector base class,
      with attributes such as this_word, min_n_components, max_n_components
    - Loop from min_n_components to max_n_components
    - Pick the model with the highest DIC score

    References:
    [0] - http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.58.6208&rep=rep1&type=pdf
    """
    def calc_log_likelihood_other_words(self, hmm_model, other_words):
        # Anti-log-likelihoods: score the fitted model against every competing word
        return [hmm_model.score(X, lengths) for (X, lengths) in other_words]

    def calc_best_score_dic(self, score_dics):
        # Max of list of (score, model) tuples, comparing by the score at index 0
        return max(score_dics, key=lambda x: x[0])

    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        other_words = [self.hwords[word] for word in self.words if word != self.this_word]
        models = []
        score_dics = []
        for num_states in range(self.min_n_components, self.max_n_components + 1):
            try:
                hmm_model = self.base_model(num_states)
                log_likelihood_original_word = hmm_model.score(self.X, self.lengths)
                models.append((log_likelihood_original_word, hmm_model))
            except Exception:
                # A model may be invalid, e.g. when there are more parameters to
                # fit than there are samples, so skip that number of states
                pass
        for log_likelihood_original_word, hmm_model in models:
            score_dic = log_likelihood_original_word - np.mean(
                self.calc_log_likelihood_other_words(hmm_model, other_words))
            score_dics.append((score_dic, hmm_model))
        return self.calc_best_score_dic(score_dics)[1] if score_dics else None

class SelectorCV(ModelSelector):
    """
    Abbreviations:
    - CV - Cross-Validation

    About CV:
    - If we score a model only on the log likelihood of the feature sequences
      it was trained on, more complex models will tend to have higher
      likelihoods, but that tells us nothing about which model would score
      "better" on unseen data; the model will likely overfit as complexity
      is added.
    - CV estimates the "better" topology using only training data, by
      comparing validation scores.
    - The technique breaks the training set into "folds", rotating which fold
      is "left out" of training; the left-out fold is scored for validation.
      This serves as a proxy for finding the "best" model for unseen data,
      e.g. a set of word sequences broken into three folds using the
      scikit-learn KFold class.
    - CV helps limit overfitting.

    Selection using CV:
    - The higher the CV score, the "better" the model.
    - The CV score for a given number of states is the average log likelihood
      over the cross-validation folds.
    - Loop from min_n_components to max_n_components
    - Pick the model with the highest average log likelihood

    References:
    [0] - http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
    [1] - https://www.r-bloggers.com/aic-bic-vs-crossvalidation/
    """
    def calc_best_score_cv(self, score_cvs):
        # Max of list of (score, model) tuples, comparing by the score at index 0
        return max(score_cvs, key=lambda x: x[0])

    def select(self):
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        score_cvs = []
        for num_states in range(self.min_n_components, self.max_n_components + 1):
            try:
                fold_log_likelihoods = []
                # Check there is sufficient data to split using KFold
                if len(self.sequences) > 2:
                    kf = KFold(n_splits=3, shuffle=False, random_state=None)
                    # CV loop: break the sequences into "folds"; the fold rotated
                    # out of the training set is scored for Cross-Validation (CV)
                    for train_index, test_index in kf.split(self.sequences):
                        # Recombine the training sequences chosen by KFold
                        self.X, self.lengths = combine_sequences(train_index, self.sequences)
                        # Recombine the held-out test sequences
                        X_test, lengths_test = combine_sequences(test_index, self.sequences)
                        hmm_model = self.base_model(num_states)
                        fold_log_likelihoods.append(hmm_model.score(X_test, lengths_test))
                else:
                    # Too few sequences to fold: train and score on all the data
                    hmm_model = self.base_model(num_states)
                    fold_log_likelihoods.append(hmm_model.score(self.X, self.lengths))
                # Average log likelihood across the CV folds for this number of states
                score_cvs.append((np.mean(fold_log_likelihoods), hmm_model))
            except Exception:
                pass
        return self.calc_best_score_cv(score_cvs)[1] if score_cvs else None
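

# Minimal smoke-test sketch (not part of the original selectors): running this
# module directly fits each selector on small synthetic data. The real project
# builds `all_word_sequences` / `all_word_Xlengths` from the ASL data set via
# asl_utils; the random data and word names below are purely illustrative.
if __name__ == "__main__":
    np.random.seed(14)

    def make_word(num_seqs, num_features=2):
        # Each word gets `num_seqs` random sequences of 5-9 frames
        seqs = [np.random.randn(np.random.randint(5, 10), num_features)
                for _ in range(num_seqs)]
        X = np.concatenate(seqs)
        lengths = [len(s) for s in seqs]
        return [s.tolist() for s in seqs], (X, lengths)

    all_word_sequences, all_word_Xlengths = {}, {}
    for word in ["BOOK", "CHOCOLATE", "VEGETABLE"]:
        all_word_sequences[word], all_word_Xlengths[word] = make_word(num_seqs=4)

    for selector_cls in [SelectorConstant, SelectorBIC, SelectorDIC, SelectorCV]:
        model = selector_cls(all_word_sequences, all_word_Xlengths, "BOOK",
                             min_n_components=2, max_n_components=4).select()
        states = model.n_components if model is not None else "no"
        print("{} selected a model with {} states".format(selector_cls.__name__, states))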