update workflow and loss computation
orisenbazuru committed Aug 24, 2020
1 parent 550a571 commit f614b19
Showing 1 changed file with 255 additions and 21 deletions.
276 changes: 255 additions & 21 deletions ddi/run_workflow.py
@@ -66,7 +66,7 @@ def __repr__(self):
self.num_epochs)
return desc

def generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype, loss_func='nllloss', margin=0.5):
def generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype, loss_func='nllloss', margin=0.5, loss_w=0.5):

dataloader_config = {'batch_size': hyperparam_config.batch_size,
'num_workers': 0}
@@ -84,11 +84,12 @@ def generate_models_config(hyperparam_config, similarity_type, model_name, input
'fdtype':fdtype,
'to_gpu':True,
'loss_func':loss_func,
'contrastiveloss_margin':margin}
'contrastiveloss_margin':margin,
'loss_w':loss_w}

return config, options

def build_custom_config_map(similarity_type, model_name, loss_func='nllloss', margin=0.5):
def build_custom_config_map(similarity_type, model_name, loss_func='nllloss', margin=0.5, loss_w=0.5):
if(model_name == 'NDD'):
hyperparam_config = NDDHyperparamConfig(400,300,0.5,0,200,20)
input_dim = 1096
@@ -97,7 +98,14 @@ def build_custom_config_map(similarity_type, model_name, loss_func='nllloss', ma
input_dim = 548
fold_num = -1
fdtype = torch.float32
mconfig, options = generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype, loss_func=loss_func, margin=margin)
mconfig, options = generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype, loss_func=loss_func, margin=margin, loss_w=loss_w)
return mconfig, options

def build_dditrf_config_map(input_dim, similarity_type, model_name, hyperparam_opt, loss_func='nllloss', margin=0.5, loss_w=0.5):
hyperparam_config = DDITrfHyperparamConfig(*hyperparam_opt)
fold_num = -1
fdtype = torch.float32
mconfig, options = generate_models_config(hyperparam_config, similarity_type, model_name, input_dim, fold_num, fdtype, loss_func=loss_func, margin=margin, loss_w=loss_w)
return mconfig, options
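
# Hedged usage sketch (not part of the commit): building a Transformer run config that
# carries the new loss_w option down into `options`. The positional order of
# hyperparam_opt is assumed to mirror generate_hyperparam_space() further below,
# similarity_type='ssp' is a made-up placeholder, and input_dim=548 mirrors the
# non-NDD branch of build_custom_config_map above.
import torch.nn as nn
from ddi.run_workflow import build_dditrf_config_map  # assumes the repo is importable as a package

hyperparam_opt = (None, 2, 2, 0.3, nn.ReLU(), 2, 'attn', 'cosine', 1e-4, 200, 50)
mconfig, options = build_dditrf_config_map(input_dim=548,
                                           similarity_type='ssp',
                                           model_name='Transformer',
                                           hyperparam_opt=hyperparam_opt,
                                           loss_func='nllloss',
                                           margin=0.5,
                                           loss_w=0.5)
# options['loss_w'] is later read in run_ddiTrf via options.get('loss_w', 0.5)
# to balance the classification loss against the contrastive loss.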

def dump_dict_content(dsettype_content_map, dsettypes, desc, wrk_dir):
@@ -110,10 +118,10 @@ def get_random_fold(num_folds, random_seed=42):
return fold_num

def hyperparam_model_search(data_partitions, similarity_type, model_name,
input_dim, root_dir, fold_gpu_map, loss_func='nllloss', margin=0.5,
input_dim, root_dir, fold_gpu_map,
loss_func='nllloss', margin=0.5, loss_w=0.5,
fdtype=torch.float32, num_epochs=25,
prob_interval_truemax=0.05, prob_estim=0.95, random_seed=42,
per_base=False):
prob_interval_truemax=0.05, prob_estim=0.95, random_seed=42):
# fold_num = get_random_run(len(data_partitions), random_seed=random_seed)
fold_num = get_random_fold(len(data_partitions), random_seed=random_seed)
dsettypes = ['train', 'validation']
@@ -127,7 +135,8 @@ def hyperparam_model_search(data_partitions, similarity_type, model_name,
fold_num,
fdtype,
loss_func=loss_func,
margin=margin)
margin=margin,
loss_w=loss_w)
options['num_epochs'] = num_epochs # override number of epochs here
print("Running {} config #{}".format(similarity_type, counter))
path = os.path.join(root_dir, 'fold_{}'.format(fold_num), 'config_{}'.format(counter))
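
# Hedged sketch (assumption -- the body of hyperparam_model_search is collapsed in this
# diff): the usual way prob_interval_truemax / prob_estim drive a random hyperparameter
# search is to sample enough configurations so that, with probability prob_estim, at
# least one lands in the top prob_interval_truemax fraction of the search space.
import numpy as np

def get_num_trials(prob_interval_truemax=0.05, prob_estim=0.95):
    # P(at least one of n samples in top q fraction) = 1 - (1 - q)^n >= prob_estim
    return int(np.ceil(np.log(1 - prob_estim) / np.log(1 - prob_interval_truemax)))

print(get_num_trials())  # 59 for the default values above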
@@ -202,7 +211,7 @@ def run_ddi(data_partition, dsettypes, config, options, wrk_dir,
# pytorch version >1.1, scheduler should be called after optimizer
# for cyclical lr scheduler, it should be called after each batch update
num_iter = len(data_loaders['train']) # num_train_samples/batch_size
c_step_size = int(np.ceil(5*num_iter)) # this should be 2-10 times num_iter
c_step_size = int(np.ceil(2*num_iter)) # this should be 2-10 times num_iter
base_lr = 3e-4
max_lr = 5*base_lr # 3-5 times base_lr
cyc_scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr, max_lr, step_size_up=c_step_size,
@@ -309,6 +318,232 @@ def run_ddi(data_partition, dsettypes, config, options, wrk_dir,



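# For context on the cyclical-LR setup changed in run_ddi above (c_step_size is now
# 2*num_iter rather than 5*num_iter), a hedged, self-contained sketch; the dummy model
# and num_iter value are placeholders, not part of the repository.
import numpy as np
import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-4)

num_iter = 100                              # len(data_loaders['train']) in the real code
c_step_size = int(np.ceil(2 * num_iter))    # recommended 2-10x the iterations per epoch
base_lr = 3e-4
max_lr = 5 * base_lr                        # recommended 3-5x base_lr
cyc_scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr, max_lr,
                                                  step_size_up=c_step_size,
                                                  mode='triangular',
                                                  cycle_momentum=False)
# for torch >= 1.1 the scheduler is stepped once per batch, after optimizer.step()
for _ in range(num_iter):
    optimizer.step()
    cyc_scheduler.step()
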
# def run_ddiTrf(data_partition, dsettypes, config, options, wrk_dir,
# state_dict_dir=None, to_gpu=True, gpu_index=0):
# pid = "{}".format(os.getpid()) # process id description
# # get data loader config
# dataloader_config = config['dataloader_config']
# cld = construct_load_dataloaders(data_partition, dsettypes, dataloader_config, wrk_dir)
# # dictionaries by dsettypes
# data_loaders, epoch_loss_avgbatch, score_dict, class_weights, flog_out = cld
# print(flog_out)
# # print(class_weights)
# device = get_device(to_gpu, gpu_index) # gpu device
# fdtype = options['fdtype']

# if('train' in class_weights):
# class_weights = class_weights['train'].type(fdtype).to(device) # update class weights to fdtype tensor
# else:
# class_weights = torch.tensor([1]*2).type(fdtype).to(device) # weighting all cases equally

# print("class weights", class_weights)
# loss_func = torch.nn.NLLLoss(weight=class_weights, reduction='mean') # negative log likelihood loss
# loss_contrastive = ContrastiveLoss(options.get('contrastiveloss_margin', 0.5), reduction='mean')
# # loss_contrastive = CosEmbLoss(options.get('contrastiveloss_margin', 0.5), reduction='mean')
# loss_contrastive.type(fdtype).to(device)
# # loss_attn = FeatureEmbAttention(1)
# # loss_attn.type(fdtype).to(device)

# num_epochs = options.get('num_epochs', 50)
# fold_num = options.get('fold_num')

# # parse config dict
# model_config = config['model_config']
# model_name = options['model_name']


# if(model_name == 'Transformer'):
# ddi_model = DDI_Transformer(input_size=options['input_dim'],
# input_embed_dim=model_config.input_embed_dim,
# num_attn_heads=model_config.num_attn_heads,
# mlp_embed_factor=model_config.mlp_embed_factor,
# nonlin_func=model_config.nonlin_func,
# pdropout=model_config.p_dropout,
# num_transformer_units=model_config.num_transformer_units,
# pooling_mode=model_config.pooling_mode)
# ddi_siamese = DDI_SiameseTrf(options['input_dim'],model_config.dist_opt, num_classes=2)

# # ddi_siamese = DDI_SiameseTrf(model_config.input_embed_dim,model_config.dist_opt, num_classes=2)


# # define optimizer and group parameters
# models_param = list(ddi_model.parameters()) + list(ddi_siamese.parameters())
# models = [(ddi_model, model_name), (ddi_siamese, f'{model_name}_Siamese')]

# if(state_dict_dir): # load state dictionary of saved models
# for m, m_name in models:
# m.load_state_dict(torch.load(os.path.join(state_dict_dir, '{}.pkl'.format(m_name)), map_location=device))

# # update models fdtype and move to device
# for m, m_name in models:
# m.type(fdtype).to(device)

# print('cool')
# if('train' in data_loaders):
# weight_decay = options.get('weight_decay', 1e-4)
# print('weight_decay', weight_decay)
# # split model params into attn parameters and other params
# # models_param = add_weight_decay_except_attn([ddi_model, ddi_siamese], weight_decay)
# # see paper Cyclical Learning Rates for Training Neural Networks for parameters' choice
# # `https://arxiv.org/pdf/1506.01186.pdf`
# # pytorch version >1.1, scheduler should be called after optimizer
# # for cyclical lr scheduler, it should be called after each batch update
# num_iter = len(data_loaders['train']) # num_train_samples/batch_size
# c_step_size = int(np.ceil(5*num_iter)) # this should be 2-10 times num_iter
# base_lr = 3e-4
# max_lr = 5*base_lr # 3-5 times base_lr
# print('max lr', max_lr)
# base_lr = 1e-2
# print('base_lr', base_lr)
# optimizer = torch.optim.Adam(models_param, weight_decay=weight_decay, lr=base_lr)
# # optimizer = torch.optim.Adam(models_param, lr=base_lr)
# # cyc_scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr, max_lr, step_size_up=c_step_size,
# # mode='triangular', cycle_momentum=False)
# # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lr,
# # steps_per_epoch=num_iter,
# # epochs=num_epochs)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=1, verbose=True)

# if ('validation' in data_loaders):
# m_state_dict_dir = create_directory(os.path.join(wrk_dir, 'model_statedict'))

# if(num_epochs > 1):
# fig_dir = create_directory(os.path.join(wrk_dir, 'figures'))

# # dump config dictionaries on disk
# config_dir = create_directory(os.path.join(wrk_dir, 'config'))
# ReaderWriter.dump_data(config, os.path.join(config_dir, 'mconfig.pkl'))
# ReaderWriter.dump_data(options, os.path.join(config_dir, 'exp_options.pkl'))
# # store attention weights for validation and test set
# seqid_fattnw_map = {dsettype: {'X_a':{}, 'X_b':{}} for dsettype in data_loaders if dsettype in {'test'}}
# pair_names = ('a', 'b')

# for epoch in range(num_epochs):
# # print("-"*35)
# for dsettype in dsettypes:
# print("device: {} | similarity_type: {} | fold_num: {} | epoch: {} | dsettype: {} | pid: {}"
# "".format(device, options.get('similarity_type'), fold_num, epoch, dsettype, pid))
# pred_class = []
# ref_class = []
# prob_scores = []
# ddi_ids = []
# data_loader = data_loaders[dsettype]
# # total_num_samples = len(data_loader.dataset)
# epoch_loss = 0.

# if(dsettype == 'train'): # should be only for train
# for m, m_name in models:
# m.train()
# else:
# for m, m_name in models:
# m.eval()

# for i_batch, samples_batch in enumerate(data_loader):
# print('batch num:', i_batch)

# # zero model grad
# if(dsettype == 'train'):
# optimizer.zero_grad()

# X_a, X_b, y_batch, ids = samples_batch
# # print(y_batch.shape)

# X_a = X_a.to(device)
# X_b = X_b.to(device)
# y_batch = y_batch.reshape(-1) # TODO: reshape when preprocessing feature

# y_batch = y_batch.type(torch.int64).to(device)
# # print('ids', ids.shape, ids.dtype)

# with torch.set_grad_enabled(dsettype == 'train'):
# # print("number of examples in batch:", docs_batch.size(0))
# num_samples_perbatch = X_a.size(0)
# # print("number_samples_per_batch", num_samples_perbatch)
# z_a, fattn_w_scores_a = ddi_model(X_a)
# z_b, fattn_w_scores_b = ddi_model(X_b)

# if(dsettype in seqid_fattnw_map and model_config.pooling_mode == 'attn'):
# for l, attn_scores in enumerate((fattn_w_scores_a, fattn_w_scores_b)):
# suffix = pair_names[l]
# seqid_fattnw_map[dsettype][f'X_{suffix}'].update({sid.item():attn_scores[c].detach().cpu() for c, sid in enumerate(ids)})


# logsoftmax_scores, dist = ddi_siamese(z_a, z_b)

# __, y_pred_clss = torch.max(logsoftmax_scores, -1)

# y_pred_prob = torch.exp(logsoftmax_scores.detach().cpu()).numpy()

# # print(y_pred_prob.shape)
# pred_class.extend(y_pred_clss.view(-1).tolist())
# ref_class.extend(y_batch.view(-1).tolist())
# prob_scores.append(y_pred_prob)
# # print(prob_scores)
# ddi_ids.extend(ids.tolist())

# cl = loss_func(logsoftmax_scores, y_batch)

# dl = loss_contrastive(dist.reshape(-1), y_batch.type(fdtype))
# # print(cl)
# # print('cl', cl.shape)
# # print('dl', dl.shape)
# # cl.unsqueeze_(-1).unsqueeze_(-1)
# # dl.unsqueeze_(-1).unsqueeze_(-1)
# # # print('cl', cl.shape)
# # loss, __ = loss_attn(torch.cat([cl,dl], axis=1))
# # loss = loss.mean()
# # # print(loss)

# loss = cl + dl
# # loss = cl
# # loss = 0.8*loss_func(logsoftmax_scores, y_batch) + 0.2*loss_contrastive(dist.reshape(-1), y_batch)
# # loss = loss_func(logsoftmax_scores, y_batch)

# if(dsettype == 'train'):
# # print("computing loss")
# # backward step (i.e. compute gradients)
# loss.backward()
# # optimizer step -- update weights
# optimizer.step()

# epoch_loss += loss.item()

# # torch.cuda.ipc_collect()
# # torch.cuda.empty_cache()
# # end of epoch
# # print("+"*35)
# epoch_loss_avgbatch[dsettype].append(epoch_loss/len(data_loader))


# prob_scores_arr = np.concatenate(prob_scores, axis=0)
# # print(prob_scores_arr.shape)
# modelscore = perfmetric_report(pred_class, ref_class, prob_scores_arr[:,1], epoch, flog_out[dsettype])

# perf = modelscore.s_aupr
# if dsettype == 'validation':
# scheduler.step(perf)
# print('scheduler step for perf', perf)

# best_rec_score = score_dict[dsettype].s_aupr
# if(perf > best_rec_score):
# score_dict[dsettype] = modelscore
# if(dsettype == 'validation'):
# for m, m_name in models:
# torch.save(m.state_dict(), os.path.join(m_state_dict_dir, '{}.pkl'.format(m_name)))
# elif(dsettype == 'test'):
# # dump attention weights for the test data
# dump_dict_content(seqid_fattnw_map, ['test'], 'sampleid_fattnw_map', wrk_dir)
# if dsettype in {'test', 'validation'}:
# predictions_df = build_predictions_df(ddi_ids, ref_class, pred_class, prob_scores_arr)
# predictions_path = os.path.join(wrk_dir, f'predictions_{dsettype}.csv')
# predictions_df.to_csv(predictions_path)

# if(num_epochs > 1):
# plot_loss(epoch_loss_avgbatch, fig_dir)
# # dump_scores
# dump_dict_content(score_dict, list(score_dict.keys()), 'score', wrk_dir)


def run_ddiTrf(data_partition, dsettypes, config, options, wrk_dir,
state_dict_dir=None, to_gpu=True, gpu_index=0):
pid = "{}".format(os.getpid()) # process id description
@@ -330,10 +565,8 @@ def run_ddiTrf(data_partition, dsettypes, config, options, wrk_dir,
print("class weights", class_weights)
loss_func = torch.nn.NLLLoss(weight=class_weights, reduction='mean') # negative log likelihood loss
loss_contrastive = ContrastiveLoss(options.get('contrastiveloss_margin', 0.5), reduction='mean')
# loss_contrastive = CosEmbLoss(options.get('contrastiveloss_margin', 0.5), reduction='mean')
loss_contrastive.type(fdtype).to(device)
# loss_attn = FeatureEmbAttention(1)
# loss_attn.type(fdtype).to(device)
loss_w = options.get('loss_w', 0.5)

num_epochs = options.get('num_epochs', 50)
fold_num = options.get('fold_num')
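
# As a reference for the loss terms combined further below, a hedged sketch of one
# standard contrastive-loss formulation with the same call signature, loss(dist, y),
# where y=1 marks a positive pair. The repository's actual ContrastiveLoss is not shown
# in this diff and may differ in label convention or scaling.
import torch
import torch.nn as nn

class ContrastiveLossSketch(nn.Module):
    def __init__(self, margin=0.5, reduction='mean'):
        super().__init__()
        self.margin = margin
        self.reduction = reduction

    def forward(self, dist, y):
        # pull positive pairs together, push negative pairs at least `margin` apart
        loss = y * dist.pow(2) + (1 - y) * torch.clamp(self.margin - dist, min=0).pow(2)
        return loss.mean() if self.reduction == 'mean' else loss.sum()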
@@ -368,7 +601,8 @@ def run_ddiTrf(data_partition, dsettypes, config, options, wrk_dir,
# update models fdtype and move to device
for m, m_name in models:
m.type(fdtype).to(device)


print('cool')
if('train' in data_loaders):
weight_decay = options.get('weight_decay', 1e-4)
print('weight_decay', weight_decay)
@@ -478,7 +712,7 @@ def run_ddiTrf(data_partition, dsettypes, config, options, wrk_dir,
# loss = loss.mean()
# # print(loss)

loss = cl + dl
loss = loss_w*cl + (1-loss_w)*dl
# loss = cl
# loss = 0.8*loss_func(logsoftmax_scores, y_batch) + 0.2*loss_contrastive(dist.reshape(-1), y_batch)
# loss = loss_func(logsoftmax_scores, y_batch)
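
# The main change of this commit is the weighted objective above: instead of an
# unweighted sum, the classification term (cl) and the contrastive term (dl) are mixed
# by loss_w, read earlier from options. A tiny numeric illustration with made-up values:
loss_w = 0.5
cl, dl = 0.9, 0.3                          # example per-batch loss values
loss = loss_w * cl + (1 - loss_w) * dl     # 0.6 for these numbers
# loss_w = 1.0 recovers pure classification; loss_w = 0.0 trains on the contrastive term only.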
@@ -550,17 +784,17 @@ def generate_hyperparam_space(model_name):
opt_lst = [fc1_dim, fc2_dim, dropout_vals, l2_reg_vals, batch_size_vals, num_epochs_vals]
elif(model_name == 'Transformer'):
# TODO: add the possible options for transformer model
embed_dim = [16,32,64,128]
num_attn_heads = [4,6,8]
num_transformer_units = [2]
embed_dim = [None]
num_attn_heads = [1,2]
num_transformer_units = [1,2]
p_dropout = [0.1, 0.3, 0.5]
nonlin_func = [nn.ReLU()]
mlp_embed_factor = [2]
pooling_mode = ['attn']
dist_opt = ['euclidean']
l2_reg = [1e-4, 1e-3, 1e-2]
batch_size = [4000]
num_epochs = [25]
dist_opt = ['cosine']
l2_reg = [1e-4, 1e-3, 1e-5]
batch_size = [200, 2500]
num_epochs = [50]
opt_lst = [embed_dim, num_attn_heads,
num_transformer_units, p_dropout,
nonlin_func, mlp_embed_factor, pooling_mode, dist_opt,

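# Hedged sketch (assumption): the collapsed tail of generate_hyperparam_space presumably
# expands opt_lst into all combinations, e.g. with itertools.product. For the updated
# Transformer lists above this yields 1*2*2*3*1*1*1*1*3*2*1 = 72 candidate configurations.
from itertools import product
import torch.nn as nn

embed_dim = [None]
num_attn_heads = [1, 2]
num_transformer_units = [1, 2]
p_dropout = [0.1, 0.3, 0.5]
nonlin_func = [nn.ReLU()]
mlp_embed_factor = [2]
pooling_mode = ['attn']
dist_opt = ['cosine']
l2_reg = [1e-4, 1e-3, 1e-5]
batch_size = [200, 2500]
num_epochs = [50]

opt_lst = [embed_dim, num_attn_heads, num_transformer_units, p_dropout,
           nonlin_func, mlp_embed_factor, pooling_mode, dist_opt,
           l2_reg, batch_size, num_epochs]
hyperparam_space = list(product(*opt_lst))
print(len(hyperparam_space))  # 72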