From addf902cdff0ca31040cb2dd15d50ad65e7fa2a8 Mon Sep 17 00:00:00 2001
From: orisenbazuru
Date: Mon, 15 Jun 2020 09:28:10 +0200
Subject: [PATCH] Update the model and loss function

---
 ddi/dataset.py      |  35 ++++-
 ddi/model_attn.py   |  78 +++++++++-
 ddi/run_workflow.py | 357 +++++++++++++++++++++++++++++++++++++-------
 ddi/utilities.py    |  78 +++++++++-
 4 files changed, 479 insertions(+), 69 deletions(-)

diff --git a/ddi/dataset.py b/ddi/dataset.py
index b55e772..05531fe 100644
--- a/ddi/dataset.py
+++ b/ddi/dataset.py
@@ -58,7 +58,6 @@ def construct_load_dataloaders(dataset_fold, dsettypes, config, wrk_dir):
     # setup data loaders
     data_loaders = {}
     epoch_loss_avgbatch = {}
-    epoch_loss_avgsamples = {}
     flog_out = {}
     score_dict = {}
     class_weights = {}
@@ -75,19 +74,19 @@ def construct_load_dataloaders(dataset_fold, dsettypes, config, wrk_dir):
                                            num_workers=config['num_workers'])

         epoch_loss_avgbatch[dsettype] = []
-        epoch_loss_avgsamples[dsettype] = []
         score_dict[dsettype] = ModelScore(0, 0.0, 0.0, 0.0, 0.0, 0.0)  # (best_epoch, auc, aupr, f1, precision, recall)
         if(wrk_dir):
             flog_out[dsettype] = os.path.join(wrk_dir, dsettype + ".log")
         else:
             flog_out[dsettype] = None

-    return (data_loaders, epoch_loss_avgbatch, epoch_loss_avgsamples, score_dict, class_weights, flog_out)
+    return (data_loaders, epoch_loss_avgbatch, score_dict, class_weights, flog_out)

 def preprocess_features(feat_fpath):
     X_fea = np.loadtxt(feat_fpath, dtype=float, delimiter=",")
     r, c = np.triu_indices(len(X_fea), 1)  # take indices above the diagonal (offset by 1)
     return np.concatenate((X_fea[r], X_fea[c]), axis=1)

+
 def preprocess_labels(interaction_fpath):
     interaction_matrix = np.loadtxt(interaction_fpath, dtype=float, delimiter=",")
     r, c = np.triu_indices(len(interaction_matrix), 1)  # take indices above the diagonal (offset by 1)
@@ -102,7 +101,7 @@ def create_setvector_features(X, num_sim_types):
     h = np.transpose(g, axes=(2, 0, 1))
     return h

-def get_stratified_partitions(ddi_datatensor, num_folds=5, valid_set_portion=0.1, random_state=42):
+def get_stratified_partitions(ddi_datatensor, num_folds=5, random_state=42):
     """Generate stratified k-fold splits of drug-pair ids based on the interaction label

     Args:
         ddi_datatensor: instance of :class:`DDIDataTensor`
@@ -127,6 +126,32 @@ def get_stratified_partitions(ddi_datatensor, num_folds=5, valid_set_portion=0.1
         print("-"*25)
     return(data_partitions)

+def get_validation_partitions(ddi_datatensor, num_folds=2, valid_set_portion=0.1, random_state=42):
+    """Generate stratified train/validation splits of drug-pair ids based on the interaction label
+
+    Args:
+        ddi_datatensor: instance of :class:`DDIDataTensor`
+    """
+    skf_trte = StratifiedShuffleSplit(n_splits=num_folds,
+                                      test_size=valid_set_portion,
+                                      random_state=random_state)  # split into train and validation sets
+    data_partitions = {}
+    X = ddi_datatensor.X_feat
+    y = ddi_datatensor.y
+    fold_num = 0
+    for train_index, test_index in skf_trte.split(X, y):
+        data_partitions[fold_num] = {'train': train_index,
+                                     'validation': test_index}
+        print("fold_num:", fold_num)
+        print('train data')
+        report_label_distrib(y[train_index])
+        print('validation data')
+        report_label_distrib(y[test_index])
+        print()
+        fold_num += 1
+        print("-"*25)
+    return(data_partitions)

 def report_label_distrib(labels):
     classes, counts = np.unique(labels, return_counts=True)
@@ -168,6 +193,8 @@ def generate_partition_datatensor(ddi_datatensor, data_partitions):
             target_ids = data_partitions[fold_num][dsettype]
             datatensor_partition = PartitionDataTensor(ddi_datatensor, target_ids, dsettype, fold_num)
             datatensor_partitions[fold_num][dsettype] = datatensor_partition
+    compute_class_weights_per_fold_(datatensor_partitions)
+    return(datatensor_partitions)

 def build_datatensor_partitions(data_partitions, ddi_datatensor):
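generate_partition_datatensor now attaches class weights to every fold via compute_class_weights_per_fold_, whose definition is not part of this patch. As a rough sketch, assuming it derives inverse-frequency weights from each fold's training labels (the name and weighting scheme here are illustrative, not the actual helper):

    import numpy as np
    import torch

    def compute_class_weights_sketch(train_labels):
        # balanced heuristic: weight_c = n_samples / (n_classes * count_c)
        classes, counts = np.unique(train_labels, return_counts=True)
        weights = counts.sum() / (len(classes) * counts)
        return torch.tensor(weights, dtype=torch.float32)

    # e.g. 900 negatives / 100 positives -> tensor([0.5556, 5.0000])
    print(compute_class_weights_sketch(np.array([0]*900 + [1]*100)))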
diff --git a/ddi/model_attn.py b/ddi/model_attn.py
index a5035d3..0097c3c 100644
--- a/ddi/model_attn.py
+++ b/ddi/model_attn.py
@@ -108,10 +108,51 @@ def forward(self, X):

         return z


+class FeatureEmbAttention(nn.Module):
+    def __init__(self, input_dim):
+        '''
+        Args:
+            input_dim: int, size of the input vector (i.e. feature vector)
+        '''
+        super().__init__()
+        self.input_dim = input_dim
+        # use this as a query vector against the transformer outputs
+        self.queryv = nn.Parameter(torch.randn(input_dim, dtype=torch.float32), requires_grad=True)
+        self.softmax = nn.Softmax(dim=1)  # normalized across seqlen
+
+    def forward(self, X):
+        '''Performs forward computation
+        Args:
+            X: torch.Tensor, (batch, ddi similarity type vector, feat_dim), dtype=torch.float32
+        '''
+        X_scaled = X / (self.input_dim ** (1/4))
+        queryv_scaled = self.queryv / (self.input_dim ** (1/4))
+        # use matmul to compute the tensor-vector multiplication -> (bsize, seqlen)
+        attn_weights = X_scaled.matmul(queryv_scaled)
+        # softmax across seqlen
+        attn_weights_norm = self.softmax(attn_weights)
+
+        # reweight the value vectors (in this case the original input X):
+        # unsqueeze attn_weights_norm to (bsize, 1, num similarity type vectors),
+        # batch-multiply with X of shape (bsize, num similarity type vectors, feat_dim)
+        # to get (bsize, 1, feat_dim), then squeeze to (bsize, feat_dim)
+        z = attn_weights_norm.unsqueeze(1).bmm(X).squeeze(1)
+
+        # returns (bsize, feat_dim), (bsize, num similarity type vectors)
+        return z, attn_weights_norm
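FeatureEmbAttention pools over the similarity-type axis with a single learned query vector; dividing both the query and the inputs by input_dim ** (1/4) is equivalent to the usual scaling of their dot product by sqrt(input_dim). A quick shape check (illustrative sizes, assuming 4 similarity types):

    import torch

    pool = FeatureEmbAttention(input_dim=586)
    X = torch.randn(32, 4, 586)   # (batch, num similarity types, feat_dim)
    z, attn = pool(X)
    print(z.shape, attn.shape)    # torch.Size([32, 586]) torch.Size([32, 4])
    print(attn.sum(dim=1))        # each row sums to 1 thanks to the softmax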
 class DDI_Transformer(nn.Module):

     def __init__(self, input_size=586, num_attn_heads=8, mlp_embed_factor=2,
-                 nonlin_func=nn.ReLU(), pdropout=0.3, num_transformer_units=12, num_classes=1):
+                 nonlin_func=nn.ReLU(), pdropout=0.3, num_transformer_units=12,
+                 pooling_mode='attn', num_classes=2):

         super(DDI_Transformer, self).__init__()

@@ -122,9 +163,25 @@ def __init__(self, input_size=586, num_attn_heads=8, mlp_embed_factor=2,
         self.trfunit_pipeline = nn.Sequential(*trfunit_layers)
         embed_size = input_size
         self.Wy = nn.Linear(embed_size, num_classes)
+        self.pooling_mode = pooling_mode
+        if pooling_mode == 'attn':
+            self.pooling = FeatureEmbAttention(input_size)
+        elif pooling_mode == 'mean':
+            self.pooling = torch.mean
+
         # perform log softmax on the feature dimension
-        # self.log_softmax = nn.LogSoftmax(dim=-1)
-
+        self.log_softmax = nn.LogSoftmax(dim=-1)
+        self._init_params_()
+
+    def _init_params_(self):
+        for p_name, p in self.named_parameters():
+            param_dim = p.dim()
+            if param_dim > 1:  # weight matrices
+                nn.init.xavier_uniform_(p)
+            elif param_dim == 1:  # bias parameters
+                if p_name.endswith('bias'):
+                    nn.init.uniform_(p, a=-1.0, b=1.0)

     def forward(self, X):
         """
@@ -137,7 +194,16 @@ def forward(self, X):

         # pool across similarity type vectors
-        # Note: z.mean(dim=1) will change shape of z to become (batch, input_size)
-        # we can keep dimension by running z.mean(dim=1, keepdim=True) to have (batch, 1, input_size)
-        y = self.Wy(z.mean(dim=1))
-
-        return y
\ No newline at end of file
+        # Note: z.mean(dim=1) or self.pooling(z, dim=1) changes the shape of z to (batch, embedding dim);
+        # the dimension can be kept with z.mean(dim=1, keepdim=True), giving (batch, 1, embedding dim)
+        if self.pooling_mode == 'attn':
+            z, fattn_w_norm = self.pooling(z)
+        elif self.pooling_mode == 'mean':
+            z = self.pooling(z, dim=1)
+            fattn_w_norm = None
+
+        y = self.Wy(z)
+        return self.log_softmax(y), fattn_w_norm
\ No newline at end of file
diff --git a/ddi/run_workflow.py b/ddi/run_workflow.py
index 8e6b737..c21c202 100644
--- a/ddi/run_workflow.py
+++ b/ddi/run_workflow.py
@@ -58,20 +58,13 @@ def __repr__(self):
                 self.num_epochs)
         return desc

-def generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype):
+def generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype, loss_func='nllloss'):

-    # currently generic_config is shared across all models
-    # leaving it as placeholder such that custom generic configs could be passed :)
-    generic_config = {'fdtype': fdtype, 'to_gpu': True}
     dataloader_config = {'batch_size': hyperparam_config.batch_size,
                          'num_workers': 0}
+
     config = {'dataloader_config': dataloader_config,
-              'model_config': hyperparam_config,
-              'generic_config': generic_config
+              'model_config': hyperparam_config
              }

     options = {'similarity_type': similarity_type,
@@ -79,11 +72,14 @@ def generate_models_config(hyperparam_config, similarity_type, model_name, input
                'input_dim': input_dim,
                'model_name': model_name,
                'num_epochs': hyperparam_config.num_epochs,
-               'weight_decay': hyperparam_config.l2_reg}
+               'weight_decay': hyperparam_config.l2_reg,
+               'fdtype': fdtype,
+               'to_gpu': True,
+               'loss_func': loss_func}

     return config, options

-def build_custom_config_map(similarity_type, model_name):
+def build_custom_config_map(similarity_type, model_name, loss_func='nllloss'):
     if(model_name == 'NDD'):
         hyperparam_config = NDDHyperparamConfig(400, 300, 0.5, 0, 200, 20)
         input_dim = 1096
@@ -92,7 +88,7 @@ def build_custom_config_map(similarity_type, model_name):
         input_dim = 548
     fold_num = -1
     fdtype = torch.float32
-    mconfig, options = generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype)
+    mconfig, options = generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype, loss_func=loss_func)
     return mconfig, options

 def dump_dict_content(dsettype_content_map, dsettypes, desc, wrk_dir):
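With generic_config gone, fdtype and to_gpu now travel in options alongside the new loss_func switch that later selects between run_ddi and run_ddiTrf. An illustrative call and the resulting structure (the hyperparam_config object comes from the hyperparameter classes defined elsewhere in this module; 'chem' is a placeholder similarity type):

    import torch

    mconfig, options = generate_models_config(hyperparam_config, similarity_type='chem',
                                              model_name='Transformer', input_dim=548,
                                              fold_num=0, fdtype=torch.float32,
                                              loss_func='nllloss')
    # mconfig -> {'dataloader_config': {'batch_size': ..., 'num_workers': 0},
    #             'model_config': hyperparam_config}
    # options -> {'similarity_type': 'chem', ..., 'weight_decay': ...,
    #             'fdtype': torch.float32, 'to_gpu': True, 'loss_func': 'nllloss'}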
@@ -100,6 +96,40 @@ def dump_dict_content(dsettype_content_map, dsettypes, desc, wrk_dir):
     for dsettype in dsettypes:
         path = os.path.join(wrk_dir, '{}_{}.pkl'.format(desc, dsettype))
         ReaderWriter.dump_data(dsettype_content_map[dsettype], path)

+def hyperparam_model_search(data_partitions, similarity_type, model_name,
+                            input_dim, root_dir, fold_gpu_map, loss_func='nllloss',
+                            fdtype=torch.float32, num_epochs=25,
+                            prob_interval_truemax=0.05, prob_estim=0.95, random_seed=42,
+                            per_base=False):
+    # fold_num = get_random_run(len(data_partitions), random_seed=random_seed)
+    fold_num = 0
+    dsettypes = ['train', 'validation']
+    hyperparam_options = get_hyperparam_options(prob_interval_truemax, prob_estim, model_name)
+    data_partition = data_partitions[fold_num]
+    for counter, hyperparam_config in enumerate(hyperparam_options):
+        mconfig, options = generate_models_config(hyperparam_config,
+                                                  similarity_type,
+                                                  model_name,
+                                                  input_dim,
+                                                  fold_num, fdtype,
+                                                  loss_func=loss_func)
+        options['num_epochs'] = num_epochs  # override the number of epochs here
+        print("Running {} experiment on {} similarity, config #{}".format(model_name, similarity_type, counter))
+        path = os.path.join(root_dir,
+                            'fold_{}'.format(fold_num),
+                            'config_{}'.format(counter))
+        wrk_dir = create_directory(path)
+
+        if options.get('loss_func') == 'bceloss':
+            run_ddi(data_partition, dsettypes, mconfig, options, wrk_dir,
+                    state_dict_dir=None, to_gpu=True,
+                    gpu_index=fold_gpu_map[fold_num])
+        elif options.get('loss_func') == 'nllloss':
+            run_ddiTrf(data_partition, dsettypes, mconfig, options, wrk_dir,
+                       state_dict_dir=None, to_gpu=True,
+                       gpu_index=fold_gpu_map[fold_num])
+
+        print("-"*15)
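The prob_interval_truemax/prob_estim pair bounds how many random configurations need to be sampled: to hit the top p fraction of the hyperparameter space with confidence q, one needs the smallest n with 1 - (1 - p)^n >= q. A sketch of that computation (the compute_numtrials helper referenced in get_best_config_from_hyperparamsearch below is assumed to do something equivalent):

    import math

    def compute_numtrials_sketch(prob_interval_truemax, prob_estim):
        # smallest n with 1 - (1 - p)^n >= prob_estim
        n = math.log(1 - prob_estim) / math.log(1 - prob_interval_truemax)
        return int(math.ceil(n))

    print(compute_numtrials_sketch(0.05, 0.95))  # -> 59, rounded to 60 in the docstring below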
 def run_ddi(data_partition, dsettypes, config, options, wrk_dir,
             state_dict_dir=None, to_gpu=True, gpu_index=0):
     pid = "{}".format(os.getpid())  # process id description
     # get data loader config
     dataloader_config = config['dataloader_config']
     cld = construct_load_dataloaders(data_partition, dsettypes, dataloader_config, wrk_dir)
     # dictionaries by dsettypes
-    data_loaders, epoch_loss_avgbatch, epoch_loss_avgsamples, score_dict, class_weights, flog_out = cld
+    data_loaders, epoch_loss_avgbatch, score_dict, class_weights, flog_out = cld
     # print(flog_out)
     # print(class_weights)
     device = get_device(to_gpu, gpu_index)  # gpu device
-    generic_config = config['generic_config']
-    fdtype = generic_config['fdtype']
+    fdtype = options['fdtype']

     if('train' in class_weights):
         class_weights = class_weights['train'][1].type(fdtype).to(device)  # update class weights to fdtype tensor
     else:
         class_weights = torch.tensor([1]).type(fdtype).to(device)  # weight all cases equally
     print("class weights", class_weights)
-    # loss_func = torch.nn.NLLLoss(weight=class_weights, reduction='mean')  # negative log likelihood loss
     # binary cross entropy
     loss_func = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights, reduction='mean')

@@ -139,15 +167,7 @@ def run_ddi(data_partition, dsettypes, config, options, wrk_dir,
                               H2=model_config.fc2_dim,
                               D_out=1,
                               drop=model_config.p_dropout)
-    elif(model_name == 'Transformer'):
-        ddi_model = DDI_Transformer(input_size=options['input_dim'],
-                                    num_attn_heads=model_config.num_attn_heads,
-                                    mlp_embed_factor=model_config.mlp_embed_factor,
-                                    nonlin_func=model_config.nonlin_func,
-                                    pdropout=model_config.p_dropout,
-                                    num_transformer_units=model_config.num_transformer_units)

     # define optimizer and group parameters
     models_param = list(ddi_model.parameters())
     models = [(ddi_model, model_name)]

@@ -198,7 +218,6 @@ def run_ddi(data_partition, dsettypes, config, options, wrk_dir,
             data_loader = data_loaders[dsettype]
             # total_num_samples = len(data_loader.dataset)
             epoch_loss = 0.
-            epoch_loss_deavrg = 0.

             if(dsettype == 'train'):  # should be only for train
                 for m, m_name in models:

@@ -250,44 +269,216 @@ def run_ddi(data_partition, dsettypes, config, options, wrk_dir,
                         # after each batch step the scheduler
                         cyc_scheduler.step()
                     epoch_loss += loss.item()
-                    # deaverage the loss to deal with last batch with unequal size
-                    epoch_loss_deavrg += loss.item() * num_samples_perbatch

                     # torch.cuda.ipc_collect()
                     # torch.cuda.empty_cache()
             # end of epoch
             # print("+"*35)
             epoch_loss_avgbatch[dsettype].append(epoch_loss/len(data_loader))
-            epoch_loss_avgsamples[dsettype].append(epoch_loss_deavrg/len(data_loader.dataset))
             modelscore = perfmetric_report(pred_class, ref_class, prob_scores, epoch+1, flog_out[dsettype])
-            perf = modelscore.s_auc
-            if(perf > score_dict[dsettype].s_auc):
-                score_dict[dsettype] = modelscore
-                for m, m_name in models:
-                    torch.save(m.state_dict(), os.path.join(m_state_dict_dir, '{}_{}.pkl'.format(m_name, (epoch+1))))
+            prob_scores_arr = np.concatenate(prob_scores, axis=0)
+
+            perf = modelscore.s_aupr
+            if(perf > score_dict[dsettype].s_aupr):
+                score_dict[dsettype] = modelscore
+                if(dsettype == 'validation'):
+                    for m, m_name in models:
+                        torch.save(m.state_dict(), os.path.join(m_state_dict_dir, '{}.pkl'.format(m_name)))
+            if dsettype in {'test', 'validation'}:
+                predictions_df = build_predictions_df(ddi_ids, ref_class, pred_class, prob_scores_arr)
+                predictions_path = os.path.join(wrk_dir, f'predictions_{dsettype}.csv')
+                predictions_df.to_csv(predictions_path)

     if(num_epochs > 1):
-        plot_loss(epoch_loss_avgbatch, epoch_loss_avgsamples, fig_dir)
+        plot_loss(epoch_loss_avgbatch, fig_dir)

     # dump_scores
     dump_dict_content(score_dict, list(score_dict.keys()), 'score', wrk_dir)
-    # this will run once
-    if(dsettype == 'test'):
-        # save predictions
-        predictions_df = build_predictions_df(ddi_ids, ref_class, pred_class, prob_scores)
-        predictions_path = os.path.join(wrk_dir, 'predictions.csv')
-        predictions_df.to_csv(predictions_path)
-    # return ref_class, pred_class, prob_scores
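run_ddi keeps the single-logit BCEWithLogitsLoss path, while run_ddiTrf below feeds two-class log-softmax outputs to NLLLoss; the two losses expect different output and target shapes. A minimal side-by-side (illustrative values):

    import torch

    # bceloss path: one logit per drug pair, float targets of shape (batch, 1)
    bce = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([5.0]))
    bce_loss = bce(torch.randn(8, 1), torch.ones(8, 1))

    # nllloss path: log-probabilities over 2 classes, int64 targets of shape (batch,)
    nll = torch.nn.NLLLoss(weight=torch.tensor([1.0, 5.0]))
    log_probs = torch.log_softmax(torch.randn(8, 2), dim=-1)
    nll_loss = nll(log_probs, torch.ones(8, dtype=torch.int64))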
+def run_ddiTrf(data_partition, dsettypes, config, options, wrk_dir,
+               state_dict_dir=None, to_gpu=True, gpu_index=0):
+    pid = "{}".format(os.getpid())  # process id description
+    # get data loader config
+    dataloader_config = config['dataloader_config']
+    cld = construct_load_dataloaders(data_partition, dsettypes, dataloader_config, wrk_dir)
+    # dictionaries by dsettypes
+    data_loaders, epoch_loss_avgbatch, score_dict, class_weights, flog_out = cld
+    # print(flog_out)
+    # print(class_weights)
+    device = get_device(to_gpu, gpu_index)  # gpu device
+    fdtype = options['fdtype']
+    num_classes = 2
+
+    if('train' in class_weights):
+        class_weights = class_weights['train'].type(fdtype).to(device)  # update class weights to fdtype tensor
+    else:
+        class_weights = torch.tensor([1]*num_classes).type(fdtype).to(device)  # weight all classes equally
+
+    print("class weights", class_weights)
+    loss_func = torch.nn.NLLLoss(weight=class_weights, reduction='mean')  # negative log likelihood loss
+
+    num_epochs = options.get('num_epochs', 50)
+    fold_num = options.get('fold_num')
+
+    # parse config dict
+    model_config = config['model_config']
+    model_name = options['model_name']
+
+    if(model_name == 'Transformer'):
+        ddi_model = DDI_Transformer(input_size=options['input_dim'],
+                                    num_attn_heads=model_config.num_attn_heads,
+                                    mlp_embed_factor=model_config.mlp_embed_factor,
+                                    nonlin_func=model_config.nonlin_func,
+                                    pdropout=model_config.p_dropout,
+                                    num_transformer_units=model_config.num_transformer_units,
+                                    pooling_mode=model_config.pooling_mode,
+                                    num_classes=num_classes)
+
+    # define optimizer and group parameters
+    models_param = list(ddi_model.parameters())
+    models = [(ddi_model, model_name)]
+
+    if(state_dict_dir):  # load state dictionary of saved models
+        for m, m_name in models:
+            m.load_state_dict(torch.load(os.path.join(state_dict_dir, '{}.pkl'.format(m_name)), map_location=device))
+
+    # update models fdtype and move to device
+    for m, m_name in models:
+        m.type(fdtype).to(device)
+
+    if('train' in data_loaders):
+        weight_decay = options.get('weight_decay', 1e-4)
+        print('weight_decay', weight_decay)
+        # see the paper "Cyclical Learning Rates for Training Neural Networks" for the parameters' choice
+        # `https://arxiv.org/pdf/1506.01186.pdf`
+        # for pytorch version > 1.1, the scheduler should be called after the optimizer;
+        # the cyclical lr scheduler is stepped after each batch update
+        num_iter = len(data_loaders['train'])  # num_train_samples/batch_size
+        c_step_size = int(np.ceil(5*num_iter))  # this should be 2-10 times num_iter
+        base_lr = 3e-4
+        max_lr = 5*base_lr  # 3-5 times base_lr
+        optimizer = torch.optim.Adam(models_param, weight_decay=weight_decay, lr=base_lr)
+        cyc_scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr, max_lr, step_size_up=c_step_size,
+                                                          mode='triangular', cycle_momentum=False)
+
+    if('validation' in data_loaders):
+        m_state_dict_dir = create_directory(os.path.join(wrk_dir, 'model_statedict'))
+
+    if(num_epochs > 1):
+        fig_dir = create_directory(os.path.join(wrk_dir, 'figures'))
+
+    # dump config dictionaries on disk
+    config_dir = create_directory(os.path.join(wrk_dir, 'config'))
+    ReaderWriter.dump_data(config, os.path.join(config_dir, 'mconfig.pkl'))
+    ReaderWriter.dump_data(options, os.path.join(config_dir, 'exp_options.pkl'))
+    # store attention weights for the test set samples
+    seqid_fattnw_map = {dsettype: {} for dsettype in data_loaders if dsettype in {'test'}}
+
+    for epoch in range(num_epochs):
+        # print("-"*35)
+        for dsettype in dsettypes:
+            print("device: {} | similarity_type: {} | fold_num: {} | epoch: {} | dsettype: {} | pid: {}"
+                  "".format(device, options.get('similarity_type'), fold_num, epoch, dsettype, pid))
+            pred_class = []
+            ref_class = []
+            prob_scores = []
+            ddi_ids = []
+            data_loader = data_loaders[dsettype]
+            # total_num_samples = len(data_loader.dataset)
+            epoch_loss = 0.
+
+            if(dsettype == 'train'):  # should be only for train
+                for m, m_name in models:
+                    m.train()
+            else:
+                for m, m_name in models:
+                    m.eval()
+
+            for i_batch, samples_batch in enumerate(data_loader):
+                print('batch num:', i_batch)
+
+                # zero model grad
+                if(dsettype == 'train'):
+                    optimizer.zero_grad()
+
+                X_batch, y_batch, ids = samples_batch
+                X_batch = X_batch.to(device)
+                y_batch = y_batch.reshape(-1)  # TODO: reshape during feature preprocessing instead
+                y_batch = y_batch.type(torch.int64).to(device)
+                # print('ids', ids.shape, ids.dtype)
+
+                with torch.set_grad_enabled(dsettype == 'train'):
+                    num_samples_perbatch = X_batch.size(0)
+                    # print("number of samples per batch:", num_samples_perbatch)
+                    logsoftmax_scores, fattn_w_scores = ddi_model(X_batch)
+
+                    if(dsettype in seqid_fattnw_map):
+                        seqid_fattnw_map[dsettype].update({sid: fattn_w_scores[c].detach().cpu() for c, sid in enumerate(ids)})
+
+                    __, y_pred_clss = torch.max(logsoftmax_scores, -1)
+                    y_pred_prob = torch.exp(logsoftmax_scores.detach().cpu()).numpy()
+
+                    pred_class.extend(y_pred_clss.view(-1).tolist())
+                    ref_class.extend(y_batch.view(-1).tolist())
+                    prob_scores.extend(y_pred_prob.tolist())
+                    ddi_ids.extend(ids.tolist())
+
+                    loss = loss_func(logsoftmax_scores, y_batch)
+                    if(dsettype == 'train'):
+                        # backward step (i.e. compute gradients)
+                        loss.backward()
+                        # optimizer step -- update weights
+                        optimizer.step()
+                        # after each batch step the scheduler
+                        cyc_scheduler.step()
+                    epoch_loss += loss.item()
+
+            # torch.cuda.ipc_collect()
+            # torch.cuda.empty_cache()
+            # end of epoch
+            # print("+"*35)
+            epoch_loss_avgbatch[dsettype].append(epoch_loss/len(data_loader))
+            modelscore = perfmetric_report(pred_class, ref_class, prob_scores, epoch, flog_out[dsettype])
+            prob_scores_arr = np.array(prob_scores)  # (num samples, num classes)
+
+            perf = modelscore.aupr
+            best_rec_score = score_dict[dsettype].aupr
+            if(perf > best_rec_score):
+                score_dict[dsettype] = modelscore
+                if(dsettype == 'validation'):
+                    for m, m_name in models:
+                        torch.save(m.state_dict(), os.path.join(m_state_dict_dir, '{}.pkl'.format(m_name)))
+                elif(dsettype == 'test'):
+                    # dump attention weights for the test data
+                    dump_dict_content(seqid_fattnw_map, ['test'], 'sampleid_fattnw_map', wrk_dir)
+            if dsettype in {'test', 'validation'}:
+                predictions_df = build_predictions_df(ddi_ids, ref_class, pred_class, prob_scores_arr)
+                predictions_path = os.path.join(wrk_dir, f'predictions_{dsettype}.csv')
+                predictions_df.to_csv(predictions_path)
+
+    if(num_epochs > 1):
+        plot_loss(epoch_loss_avgbatch, fig_dir)
+
+    # dump_scores
+    dump_dict_content(score_dict, list(score_dict.keys()), 'score', wrk_dir)
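With the triangular policy configured above, the learning rate climbs from base_lr to max_lr over step_size_up batches and back down over the same span, so step_size_up = 5*num_iter yields one full cycle every 10 epochs. A self-contained check (toy model and sizes, illustrative only):

    import numpy as np
    import torch

    num_iter = 100                           # e.g. 100 train batches per epoch
    c_step_size = int(np.ceil(5*num_iter))   # 500 batches up, 500 down -> 10-epoch cycle
    base_lr, max_lr = 3e-4, 5*3e-4

    opt = torch.optim.Adam(torch.nn.Linear(4, 2).parameters(), lr=base_lr)
    sched = torch.optim.lr_scheduler.CyclicLR(opt, base_lr, max_lr, step_size_up=c_step_size,
                                              mode='triangular', cycle_momentum=False)
    lrs = []
    for _ in range(2*c_step_size):           # one full triangular cycle
        opt.step()
        sched.step()
        lrs.append(sched.get_last_lr()[0])
    print(f'{min(lrs):.2e} {max(lrs):.2e}')  # ~3.0e-04 ~1.5e-03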
 def build_predictions_df(ids, true_class, pred_class, prob_scores):
+    prob_scores_dict = {}
+    for i in range(prob_scores.shape[-1]):
+        prob_scores_dict[f'prob_score_class{i}'] = prob_scores[:, i]
+
     df_dict = {
         'id': ids,
         'true_class': true_class,
-        'pred_class': pred_class,
-        'prob_score_class1': prob_scores,
+        'pred_class': pred_class
     }
+    df_dict.update(prob_scores_dict)
     predictions_df = pd.DataFrame(df_dict)
     predictions_df.set_index('id', inplace=True)
     return predictions_df

@@ -304,7 +495,19 @@ def generate_hyperparam_space(model_name):
         opt_lst = [fc1_dim, fc2_dim, dropout_vals, l2_reg_vals, batch_size_vals, num_epochs_vals]
     elif(model_name == 'Transformer'):
-        # TODO: add the possible options for transformer model
-        pass
+        embed_dim = [16, 32, 64, 128]
+        num_attn_heads = [4, 6, 8]
+        num_transformer_units = [2]
+        p_dropout = [0.1, 0.3, 0.5]
+        nonlin_func = [nn.ReLU()]
+        mlp_embed_factor = [2]
+        l2_reg = [1e-4, 1e-3, 1e-2]
+        batch_size = [4000]
+        num_epochs = [25]
+        opt_lst = [embed_dim, num_attn_heads,
+                   num_transformer_units, p_dropout,
+                   nonlin_func, mlp_embed_factor,
+                   l2_reg, batch_size, num_epochs]

     hyperparam_space = list(itertools.product(*opt_lst))
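The Transformer grid above spans 4 x 3 x 1 x 3 x 1 x 1 x 3 x 1 x 1 = 108 combinations, so sampling ~60 trials (per the bound sketched earlier) covers a bit over half of it. Quick check (a 'relu' placeholder stands in for nn.ReLU()):

    import itertools

    opt_lst = [[16, 32, 64, 128], [4, 6, 8], [2], [0.1, 0.3, 0.5],
               ['relu'], [2], [1e-4, 1e-3, 1e-2], [4000], [25]]
    print(len(list(itertools.product(*opt_lst))))  # -> 108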
@@ -358,6 +561,46 @@ def get_index_argmax(score_matrix, target_indx):
     argmax_indx = np.argmax(score_matrix, axis=0)[target_indx]
     return argmax_indx

+def get_best_config_from_hyperparamsearch(hyperparam_search_dir, num_runs=5, num_trials=60, num_metrics=5, metric_indx=4, random_seed=42):
+    """Read the best model config among all models tested in the hyperparam search phase
+
+    Args:
+        hyperparam_search_dir: string, root directory where the hyperparam search models are stored
+        num_trials: int, number of tested models (default 60, based on a 0.05 interval and 0.95 confidence;
+                    see :func:`compute_numtrials`)
+        metric_indx: int, (default 4) index of the metric used to rank the tested models (4 -> AUC)
+    """
+    # determine the best config from the hyperparam search
+    # fold_num = get_random_run(num_runs, random_seed=random_seed)
+    fold_num = 0
+    run_dir = os.path.join(hyperparam_search_dir, 'fold_{}'.format(fold_num))
+
+    scores = np.ones((num_trials, num_metrics))*-1
+    exist_flag = False
+
+    for config_num in range(num_trials):
+        score_file = os.path.join(run_dir, 'config_{}'.format(config_num), 'score_validation.pkl')
+        if(os.path.isfile(score_file)):
+            try:
+                mscore = ReaderWriter.read_data(score_file)
+                print(mscore)
+                scores[config_num, 0] = mscore.best_epoch_indx
+                scores[config_num, 1] = mscore.binary_f1
+                scores[config_num, 2] = mscore.macro_f1
+                scores[config_num, 3] = mscore.aupr
+                scores[config_num, 4] = mscore.auc
+                exist_flag = True
+            except Exception as e:
+                print(f'exception occurred at config_{config_num}')
+                continue
+        else:
+            print("WARNING: validation score file does not exist: {}".format(score_file))
+    if(exist_flag):
+        argmax_indx = get_index_argmax(scores, metric_indx)
+        mconfig, options = get_saved_config(os.path.join(run_dir, 'config_{}'.format(argmax_indx), 'config'))
+        return mconfig, options, argmax_indx, scores
+
+    return None
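get_index_argmax takes the column-wise argmax of the scores matrix and returns the row (config index) that wins on the chosen metric column. A tiny numeric illustration with the five-column layout used above (best_epoch, binary_f1, macro_f1, aupr, auc):

    import numpy as np

    scores = np.array([[3, 0.71, 0.68, 0.40, 0.80],
                       [5, 0.75, 0.70, 0.45, 0.86],
                       [2, 0.69, 0.66, 0.47, 0.83]])
    print(np.argmax(scores, axis=0)[4])  # -> 1: config_1 has the best AUC (column 4)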
 def train_val_run(datatensor_partitions, config_map, train_val_dir, fold_gpu_map, num_epochs=20):
     dsettypes = ['train']
@@ -373,8 +616,15 @@ def train_val_run(datatensor_partitions, config_map, train_val_dir, fold_gpu_map
         wrk_dir = create_directory(path)
         # print(wrk_dir)
         # wrk_dir = create_directory('fold_{}'.format(fold_num), create_directory('train_val_{}'.format(similarity_type), train_val_dir))
-        run_ddi(data_partition, dsettypes, mconfig, options, wrk_dir,
-                state_dict_dir=None, to_gpu=True, gpu_index=fold_gpu_map[fold_num])
+        if options.get('loss_func') == 'bceloss':
+            run_ddi(data_partition, dsettypes, mconfig, options, wrk_dir,
+                    state_dict_dir=None, to_gpu=True,
+                    gpu_index=fold_gpu_map[fold_num])
+        elif options.get('loss_func') == 'nllloss':
+            run_ddiTrf(data_partition, dsettypes, mconfig, options, wrk_dir,
+                       state_dict_dir=None, to_gpu=True,
+                       gpu_index=fold_gpu_map[fold_num])

 def test_run(datatensor_partitions, config_map, train_val_dir, test_dir, fold_gpu_map, num_epochs=1):
@@ -392,9 +642,14 @@ def test_run(datatensor_partitions, config_map, train_val_dir, test_dir, fold_gp
             state_dict_pth = os.path.join(train_dir, 'model_statedict')
             path = os.path.join(test_dir, 'test_{}'.format(similarity_type), 'fold_{}'.format(fold_num))
             test_wrk_dir = create_directory(path)
-            run_ddi(data_partition, dsettypes, mconfig, options, test_wrk_dir,
-                    state_dict_dir=state_dict_pth, to_gpu=True,
-                    gpu_index=fold_gpu_map[fold_num])
+            if options.get('loss_func') == 'bceloss':
+                run_ddi(data_partition, dsettypes, mconfig, options, test_wrk_dir,
+                        state_dict_dir=state_dict_pth, to_gpu=True,
+                        gpu_index=fold_gpu_map[fold_num])
+            elif options.get('loss_func') == 'nllloss':
+                run_ddiTrf(data_partition, dsettypes, mconfig, options, test_wrk_dir,
+                           state_dict_dir=state_dict_pth, to_gpu=True,
+                           gpu_index=fold_gpu_map[fold_num])
         else:
             print('WARNING: train dir not found: {}'.format(path))
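Putting the pieces together, a typical driver looks roughly like the sketch below. The directory paths and the 'chem' similarity type are illustrative, ddi_datatensor is assumed to be built beforehand from dataset.py, and config_map is assumed to be the (mconfig, options) pair returned by build_custom_config_map:

    data_partitions = get_validation_partitions(ddi_datatensor, num_folds=2, valid_set_portion=0.1)
    datatensor_partitions = build_datatensor_partitions(data_partitions, ddi_datatensor)
    config_map = build_custom_config_map('chem', 'Transformer', loss_func='nllloss')
    fold_gpu_map = {fold_num: 0 for fold_num in datatensor_partitions}  # everything on GPU 0

    train_val_run(datatensor_partitions, config_map, 'experiments/train_val', fold_gpu_map, num_epochs=20)
    test_run(datatensor_partitions, config_map, 'experiments/train_val', 'experiments/test', fold_gpu_map, num_epochs=1)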
diff --git a/ddi/utilities.py b/ddi/utilities.py
index bd7d329..1ea6667 100644
--- a/ddi/utilities.py
+++ b/ddi/utilities.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 from sklearn.metrics import classification_report, f1_score, roc_curve, precision_recall_curve, accuracy_score, \
-    recall_score, precision_score, roc_auc_score, auc
+    recall_score, precision_score, roc_auc_score, auc, average_precision_score
 from matplotlib import pyplot as plt


@@ -201,7 +201,7 @@ def get_interaction_stat(matrix):
     print('number of zero elements', zero_elem)
     print('diagonal elements ', np.diag(matrix))

-def perfmetric_report(pred_target, ref_target, probscore, epoch, outlog, plot_roc=True):
+def perfmetric_report(pred_target, ref_target, probscore, epoch, outlog):

     # print(ref_target.shape)
     # print(pred_target.shape)
@@ -242,10 +242,10 @@ def perfmetric_report(pred_target, ref_target, probscore, epoch, outlog, plot_ro

 def plot_precision_recall_curve(ref_target, prob_poslabel, figname, outdir):
     pr, rec, thresholds = precision_recall_curve(ref_target, prob_poslabel)
+    avg_precision = average_precision_score(ref_target, prob_poslabel)
     thresholds[0] = 1
     plt.figure(figsize=(9, 6))
-    plt.plot(pr, rec, 'bo', label='Precision vs Recall')
-    # plt.plot(np.arange(0,len(thresholds)), thresholds, 'r-', label='thresholds')
-    plt.xlabel('Precision')
-    plt.ylabel('Recall')
+    plt.plot(rec, pr, 'b+', label=f'Average Precision (AP):{avg_precision:.2}')
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
     plt.title('Precision vs. recall curve')
@@ -258,7 +258,7 @@ def plot_roc_curve(ref_target, prob_poslabel, figname, outdir):
     fpr, tpr, thresholds = roc_curve(ref_target, prob_poslabel)
     thresholds[0] = 1
     plt.figure(figsize=(9, 6))
-    plt.plot(fpr, tpr, 'bo', label='TPR vs FPR')
+    plt.plot(fpr, tpr, 'b+', label='TPR vs FPR')
     plt.plot(fpr, thresholds, 'r-', label='thresholds')
     plt.xlabel('False positive rate')
     plt.ylabel('True positive rate')
@@ -268,18 +268,80 @@ def plot_roc_curve(ref_target, prob_poslabel, figname, outdir):
     plt.close()

-def plot_loss(epoch_loss_avgbatch, epoch_loss_avgsamples, wrk_dir):
+def plot_loss(epoch_loss_avgbatch, wrk_dir):
     dsettypes = epoch_loss_avgbatch.keys()
     for dsettype in dsettypes:
         plt.figure(figsize=(9, 6))
-        plt.plot(epoch_loss_avgbatch[dsettype], 'r', epoch_loss_avgsamples[dsettype], 'b')
+        plt.plot(epoch_loss_avgbatch[dsettype], 'r')
         plt.xlabel("number of epochs")
         plt.ylabel("negative loglikelihood cost")
-        plt.legend(['epoch batch average loss', 'epoch training samples average loss'])
+        plt.legend(['epoch batch average loss'])
         plt.savefig(os.path.join(wrk_dir, dsettype + ".pdf"))
         plt.close()

+def plot_xy(x, y, xlabel, ylabel, legend, fname, wrk_dir):
+    plt.figure(figsize=(9, 6))
+    plt.plot(x, y, 'r')
+    plt.xlabel(xlabel)
+    plt.ylabel(ylabel)
+    if legend:
+        plt.legend([legend])
+    plt.savefig(os.path.join(wrk_dir, fname + ".pdf"))
+    plt.close()
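find_youdenj_threshold below picks the probability cutoff maximizing Youden's J = sensitivity + specificity - 1 = tpr - fpr. A small numeric check (illustrative arrays):

    import numpy as np

    fpr = np.array([0.0, 0.1, 0.3, 0.6, 1.0])
    tpr = np.array([0.0, 0.55, 0.8, 0.9, 1.0])
    thresholds = np.array([1.0, 0.8, 0.5, 0.3, 0.1])
    youden_indx = np.argmax(tpr - fpr)  # index 2: J = 0.8 - 0.3 = 0.5
    print(thresholds[youden_indx])      # -> 0.5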
+def find_youdenj_threshold(ref_target, prob_poslabel, fig_dir=None):
+    fpr, tpr, thresholds = roc_curve(ref_target, prob_poslabel)
+    s_auc = roc_auc_score(ref_target, prob_poslabel)
+    thresholds[0] = 1
+    plt.figure(figsize=(9, 6))
+    plt.plot(fpr, tpr, 'b+', label=f'TPR vs FPR => AUC:{s_auc:.2}')
+    # plt.plot(fpr, thresholds, 'r-', label='thresholds')
+    plt.xlabel('False positive rate')
+    plt.ylabel('True positive rate')
+    plt.title('ROC curve')
+    # Youden's J index is max(sensitivity + specificity - 1)
+    # = max(tpr + (1-fpr) - 1)
+    # = max(tpr - fpr)
+    youden_indx = np.argmax(tpr - fpr)  # index where the difference between tpr and fpr is largest
+    optimal_threshold = thresholds[youden_indx]
+    plt.plot(fpr[youden_indx], tpr[youden_indx], marker='o', markersize=3, color="red",
+             label=f'optimal probability threshold:{optimal_threshold:.2}')
+    plt.legend(loc='best')
+    if fig_dir:
+        plt.savefig(f'{fig_dir}.pdf')
+    plt.close()
+    return fpr, tpr, thresholds, optimal_threshold
+
+def analyze_precision_recall_curve(ref_target, prob_poslabel, fig_dir=None):
+    pr, rec, thresholds = precision_recall_curve(ref_target, prob_poslabel)
+    avg_precision = average_precision_score(ref_target, prob_poslabel)
+    thresholds[0] = 1
+    plt.figure(figsize=(9, 6))
+    plt.plot(rec, pr, 'b+', label=f'Precision vs Recall => Average Precision (AP):{avg_precision:.2}')
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.title('Precision vs. recall curve')
+    # pr and rec have one more entry than thresholds; drop the last point so the
+    # argmax is always a valid index into thresholds
+    indx = np.argmax(pr[:-1] + rec[:-1])
+    print('indx', indx)
+    optimal_threshold = thresholds[indx]
+    plt.plot(rec[indx], pr[indx], marker='o', markersize=3, color="red",
+             label=f'optimal probability threshold:{optimal_threshold:.2}')
+    plt.legend(loc='best')
+    if fig_dir:
+        plt.savefig(f'{fig_dir}.pdf')
+    plt.close()
+    return pr, rec, thresholds, optimal_threshold
+
 def delete_directory(directory):
     if(os.path.isdir(directory)):
         shutil.rmtree(directory)